This source file includes the following definitions.
- str_unichar_iscombiningmark
- str_utf8_insert_replace_char
- str_utf8_is_valid_string
- str_utf8_is_valid_char
- str_utf8_cnext_char
- str_utf8_cprev_char
- str_utf8_cnext_char_safe
- str_utf8_cprev_char_safe
- str_utf8_fix_string
- str_utf8_isspace
- str_utf8_ispunct
- str_utf8_isalnum
- str_utf8_isdigit
- str_utf8_isprint
- str_utf8_iscombiningmark
- str_utf8_cnext_noncomb_char
- str_utf8_cprev_noncomb_char
- str_utf8_toupper
- str_utf8_tolower
- str_utf8_length
- str_utf8_length2
- str_utf8_length_noncomb
- str_utf8_questmark_sustb
- str_utf8_conv_gerror_message
- str_utf8_vfs_convert_to
- str_utf8_make_make_term_form
- str_utf8_term_form
- utf8_tool_copy_chars_to_end
- utf8_tool_copy_chars_to
- utf8_tool_insert_space
- utf8_tool_insert_char
- utf8_tool_skip_chars_to
- utf8_tool_compose
- str_utf8_fit_to_term
- str_utf8_term_trim
- str_utf8_term_width2
- str_utf8_term_width1
- str_utf8_term_char_width
- str_utf8_term_substring
- str_utf8_trunc
- str_utf8_offset_to_pos
- str_utf8_column_to_pos
- str_utf8_create_search_needle
- str_utf8_release_search_needle
- str_utf8_search_first
- str_utf8_search_last
- str_utf8_normalize
- str_utf8_casefold_normalize
- str_utf8_compare
- str_utf8_ncompare
- str_utf8_casecmp
- str_utf8_ncasecmp
- str_utf8_prefix
- str_utf8_caseprefix
- str_utf8_create_key_gen
- str_utf8_create_key
- str_utf8_create_key_for_filename
- str_utf8_key_collate
- str_utf8_release_key
- str_utf8_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 #include <config.h>
27
28 #include <stdlib.h>
29 #include <langinfo.h>
30 #include <string.h>
31
32 #include "lib/global.h"
33 #include "lib/strutil.h"
34
35
36
37
38
39
40
41
42
43 struct utf8_tool
44 {
45 char *actual;
46 size_t remain;
47 const char *checked;
48 int ident;
49 gboolean compose;
50 };
51
52 struct term_form
53 {
54 char text[BUF_MEDIUM * 6];
55 size_t width;
56 gboolean compose;
57 };
58
59
60
/* Byte sequence EF BF BD: written in place of bytes that fail UTF-8 validation. */
static const char replch[] = "\xEF\xBF\xBD";
62
63
64
65
66
67 static gboolean
68 str_unichar_iscombiningmark (gunichar uni)
69 {
70 GUnicodeType type;
71
72 type = g_unichar_type (uni);
73 return (type == G_UNICODE_SPACING_MARK)
74 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
75 }
76
77
78
79 static void
80 str_utf8_insert_replace_char (GString * buffer)
81 {
82 g_string_append (buffer, replch);
83 }
84
85
86
87 static gboolean
88 str_utf8_is_valid_string (const char *text)
89 {
90 return g_utf8_validate (text, -1, NULL);
91 }
92
93
94
95 static int
96 str_utf8_is_valid_char (const char *ch, size_t size)
97 {
98 switch (g_utf8_get_char_validated (ch, size))
99 {
100 case (gunichar) (-2):
101 return (-2);
102 case (gunichar) (-1):
103 return (-1);
104 default:
105 return 1;
106 }
107 }
108
109
110
/**
 * Move *text forward by one character using g_utf8_next_char(); no validation is done.
 *
 * @param text in/out pointer into the string
 */
static void
str_utf8_cnext_char (const char **text)
{
    const char *following = g_utf8_next_char (*text);

    *text = following;
}
116
117
118
/**
 * Move *text back by one character using g_utf8_prev_char(); no validation is done.
 *
 * @param text in/out pointer into the string
 */
static void
str_utf8_cprev_char (const char **text)
{
    const char *preceding = g_utf8_prev_char (*text);

    *text = preceding;
}
124
125
126
/**
 * Move *text forward by one character, or by a single byte when the bytes at
 * *text do not form a valid character.
 *
 * @param text in/out pointer into the string
 */
static void
str_utf8_cnext_char_safe (const char **text)
{
    if (str_utf8_is_valid_char (*text, -1) != 1)
    {
        /* not a decodable character: step over one byte only */
        (*text)++;
        return;
    }

    *text = g_utf8_next_char (*text);
}
135
136
137
/**
 * Move *text back by one character. The candidate found by g_utf8_prev_char()
 * is accepted only if stepping forward from it (safely) lands exactly on the
 * original position; otherwise *text moves back by a single byte.
 *
 * @param text in/out pointer into the string
 */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *candidate = g_utf8_prev_char (*text);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);
    *text = (probe == *text) ? candidate : *text - 1;
}
151
152
153
154 static void
155 str_utf8_fix_string (char *text)
156 {
157 while (text[0] != '\0')
158 {
159 gunichar uni;
160
161 uni = g_utf8_get_char_validated (text, -1);
162 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
163 text = g_utf8_next_char (text);
164 else
165 {
166 text[0] = '?';
167 text++;
168 }
169 }
170 }
171
172
173
174 static gboolean
175 str_utf8_isspace (const char *text)
176 {
177 gunichar uni;
178
179 uni = g_utf8_get_char_validated (text, -1);
180 return g_unichar_isspace (uni);
181 }
182
183
184
185 static gboolean
186 str_utf8_ispunct (const char *text)
187 {
188 gunichar uni;
189
190 uni = g_utf8_get_char_validated (text, -1);
191 return g_unichar_ispunct (uni);
192 }
193
194
195
196 static gboolean
197 str_utf8_isalnum (const char *text)
198 {
199 gunichar uni;
200
201 uni = g_utf8_get_char_validated (text, -1);
202 return g_unichar_isalnum (uni);
203 }
204
205
206
207 static gboolean
208 str_utf8_isdigit (const char *text)
209 {
210 gunichar uni;
211
212 uni = g_utf8_get_char_validated (text, -1);
213 return g_unichar_isdigit (uni);
214 }
215
216
217
218 static gboolean
219 str_utf8_isprint (const char *ch)
220 {
221 gunichar uni;
222
223 uni = g_utf8_get_char_validated (ch, -1);
224 return g_unichar_isprint (uni);
225 }
226
227
228
229 static gboolean
230 str_utf8_iscombiningmark (const char *ch)
231 {
232 gunichar uni;
233
234 uni = g_utf8_get_char_validated (ch, -1);
235 return str_unichar_iscombiningmark (uni);
236 }
237
238
239
/**
 * Advance *text past the current character and any combining marks that follow it.
 *
 * @param text in/out pointer into the string
 * @return number of single-character steps taken (0 when already at the end)
 */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return steps;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
255
256
257
/**
 * Move *text backwards until it rests on a character that is not a combining
 * mark, never going before @begin.
 *
 * @param text  in/out pointer into the string
 * @param begin start of the string; the walk stops there
 * @return number of single-character steps taken (0 when already at @begin)
 */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if ((*text) == begin)
        return steps;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text) != begin);

    return steps;
}
273
274
275
276 static gboolean
277 str_utf8_toupper (const char *text, char **out, size_t * remain)
278 {
279 gunichar uni;
280 size_t left;
281
282 uni = g_utf8_get_char_validated (text, -1);
283 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
284 return FALSE;
285
286 uni = g_unichar_toupper (uni);
287 left = g_unichar_to_utf8 (uni, NULL);
288 if (left >= *remain)
289 return FALSE;
290
291 left = g_unichar_to_utf8 (uni, *out);
292 (*out) += left;
293 (*remain) -= left;
294 return TRUE;
295 }
296
297
298
299 static gboolean
300 str_utf8_tolower (const char *text, char **out, size_t * remain)
301 {
302 gunichar uni;
303 size_t left;
304
305 uni = g_utf8_get_char_validated (text, -1);
306 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
307 return FALSE;
308
309 uni = g_unichar_tolower (uni);
310 left = g_unichar_to_utf8 (uni, NULL);
311 if (left >= *remain)
312 return FALSE;
313
314 left = g_unichar_to_utf8 (uni, *out);
315 (*out) += left;
316 (*remain) -= left;
317 return TRUE;
318 }
319
320
321
/**
 * Count the characters of a string that may contain invalid bytes.
 * Each valid run is measured with g_utf8_strlen(); every offending byte
 * reported by g_utf8_validate() counts as one character.
 *
 * @param text NUL-terminated string
 * @return number of characters
 */
static int
str_utf8_length (const char *text)
{
    int chars = 0;
    const char *seg = text;
    const char *bad;

    for (; !g_utf8_validate (seg, -1, &bad) && seg[0] != '\0'; seg = bad + 1)
    {
        /* valid run in front of the offending byte */
        if (seg != bad)
            chars += g_utf8_strlen (seg, bad - seg);

        /* the offending byte itself */
        chars++;
    }

    /* nothing was skipped: the whole string is measured in one go */
    if (seg == text)
        return g_utf8_strlen (text, -1);

    /* valid tail after the last offending byte */
    if (seg[0] != '\0' && seg != bad)
        chars += g_utf8_strlen (seg, bad - seg);

    return chars;
}
346
347
348
/**
 * Like str_utf8_length(), but looks at no more than @size bytes.
 *
 * @param text NUL-terminated string
 * @param size byte budget; passed unchanged to g_utf8_strlen() when the
 *             string contains no invalid bytes
 * @return number of characters counted within the budget
 */
static int
str_utf8_length2 (const char *text, int size)
{
    int chars = 0;
    const char *seg = text;
    const char *bad;

    while (!g_utf8_validate (seg, -1, &bad) && seg[0] != '\0' && size > 0)
    {
        if (seg != bad)
        {
            /* valid run in front of the offending byte, clipped to the budget */
            chars += g_utf8_strlen (seg, MIN (bad - seg, size));
            size -= bad - seg;
        }

        /* the offending byte counts only if the budget still covers it */
        if (size > 0)
            chars++;
        size--;

        seg = bad + 1;
    }

    if (seg == text)
        return g_utf8_strlen (text, size);

    if (seg[0] != '\0' && seg != bad && size > 0)
        chars += g_utf8_strlen (seg, MIN (bad - seg, size));

    return chars;
}
376
377
378
/**
 * Count characters, treating a character together with its trailing
 * combining marks as one unit.
 *
 * @param text NUL-terminated string
 * @return number of units
 */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *cursor;
    int units = 0;

    for (cursor = text; cursor[0] != '\0'; units++)
        str_utf8_cnext_noncomb_char (&cursor);

    return units;
}
393
394
395
#if 0
/* Disabled: skip one character of *string, shrink *left by its byte length
 * and append '?' to buffer. */
static void
str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
{
    char *after = g_utf8_next_char (*string);

    *left -= after - *string;
    *string = after;
    g_string_append_c (buffer, '?');
}
#endif
408
409
410
411 static gchar *
412 str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
413 {
414 if (mcerror != NULL)
415 return g_strdup (mcerror->message);
416
417 return g_strdup (def_msg != NULL ? def_msg : "");
418 }
419
420
421
422 static estr_t
423 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
424 {
425 estr_t result = ESTR_SUCCESS;
426
427 if (coder == str_cnv_not_convert)
428 g_string_append_len (buffer, string, size);
429 else
430 result = str_nconvert (coder, string, size, buffer);
431
432 return result;
433 }
434
435
436
437
438
439 static const struct term_form *
440 str_utf8_make_make_term_form (const char *text, size_t length)
441 {
442 static struct term_form result;
443 gunichar uni;
444 size_t left;
445 char *actual;
446
447 result.text[0] = '\0';
448 result.width = 0;
449 result.compose = FALSE;
450 actual = result.text;
451
452
453
454 if (length != 0 && text[0] != '\0')
455 {
456 uni = g_utf8_get_char_validated (text, -1);
457 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
458 && str_unichar_iscombiningmark (uni))
459 {
460 actual[0] = ' ';
461 actual++;
462 result.width++;
463 result.compose = TRUE;
464 }
465 }
466
467 while (length != 0 && text[0] != '\0')
468 {
469 uni = g_utf8_get_char_validated (text, -1);
470 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
471 {
472 if (g_unichar_isprint (uni))
473 {
474 left = g_unichar_to_utf8 (uni, actual);
475 actual += left;
476 if (str_unichar_iscombiningmark (uni))
477 result.compose = TRUE;
478 else
479 {
480 result.width++;
481 if (g_unichar_iswide (uni))
482 result.width++;
483 }
484 }
485 else
486 {
487 actual[0] = '.';
488 actual++;
489 result.width++;
490 }
491 text = g_utf8_next_char (text);
492 }
493 else
494 {
495 text++;
496
497 memcpy (actual, replch, strlen (replch));
498 actual += strlen (replch);
499 result.width++;
500 }
501
502 if (length != (size_t) (-1))
503 length--;
504 }
505 actual[0] = '\0';
506
507 return &result;
508 }
509
510
511
512 static const char *
513 str_utf8_term_form (const char *text)
514 {
515 static char result[BUF_MEDIUM * 6];
516 const struct term_form *pre_form;
517
518 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
519 if (pre_form->compose)
520 {
521 char *composed;
522
523 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
524 g_strlcpy (result, composed, sizeof (result));
525 g_free (composed);
526 }
527 else
528 g_strlcpy (result, pre_form->text, sizeof (result));
529
530 return result;
531 }
532
533
534
535
536 static gboolean
537 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
538 {
539 tool->compose = FALSE;
540
541 while (tool->checked[0] != '\0')
542 {
543 gunichar uni;
544 size_t left;
545
546 uni = g_utf8_get_char (tool->checked);
547 tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
548 left = g_unichar_to_utf8 (uni, NULL);
549 if (tool->remain <= left)
550 return FALSE;
551 left = g_unichar_to_utf8 (uni, tool->actual);
552 tool->actual += left;
553 tool->remain -= left;
554 tool->checked = g_utf8_next_char (tool->checked);
555 }
556
557 return TRUE;
558 }
559
560
561
562
563
564 static gboolean
565 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
566 {
567 tool->compose = FALSE;
568
569 while (tool->checked[0] != '\0')
570 {
571 gunichar uni;
572 size_t left;
573 int w = 0;
574
575 uni = g_utf8_get_char (tool->checked);
576 if (str_unichar_iscombiningmark (uni))
577 tool->compose = TRUE;
578 else
579 {
580 w = 1;
581 if (g_unichar_iswide (uni))
582 w++;
583 if (tool->ident + w > to_ident)
584 return TRUE;
585 }
586
587 left = g_unichar_to_utf8 (uni, NULL);
588 if (tool->remain <= left)
589 return FALSE;
590 left = g_unichar_to_utf8 (uni, tool->actual);
591 tool->actual += left;
592 tool->remain -= left;
593 tool->checked = g_utf8_next_char (tool->checked);
594 tool->ident += w;
595 }
596
597 return TRUE;
598 }
599
600
601
602
603 static int
604 utf8_tool_insert_space (struct utf8_tool *tool, int count)
605 {
606 if (count <= 0)
607 return 1;
608 if (tool->remain <= (gsize) count)
609 return 0;
610
611 memset (tool->actual, ' ', count);
612 tool->actual += count;
613 tool->remain -= count;
614 return 1;
615 }
616
617
618
619
620 static int
621 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
622 {
623 if (tool->remain <= 1)
624 return 0;
625
626 tool->actual[0] = ch;
627 tool->actual++;
628 tool->remain--;
629 return 1;
630 }
631
632
633
634
635
636 static gboolean
637 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
638 {
639 gunichar uni;
640
641 while (to_ident > tool->ident && tool->checked[0] != '\0')
642 {
643 uni = g_utf8_get_char (tool->checked);
644 if (!str_unichar_iscombiningmark (uni))
645 {
646 tool->ident++;
647 if (g_unichar_iswide (uni))
648 tool->ident++;
649 }
650 tool->checked = g_utf8_next_char (tool->checked);
651 }
652
653 uni = g_utf8_get_char (tool->checked);
654 while (str_unichar_iscombiningmark (uni))
655 {
656 tool->checked = g_utf8_next_char (tool->checked);
657 uni = g_utf8_get_char (tool->checked);
658 }
659
660 return TRUE;
661 }
662
663
664
665 static void
666 utf8_tool_compose (char *buffer, size_t size)
667 {
668 char *composed;
669
670 composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
671 g_strlcpy (buffer, composed, size);
672 g_free (composed);
673 }
674
675
676
677 static const char *
678 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
679 {
680 static char result[BUF_MEDIUM * 6];
681 const struct term_form *pre_form;
682 struct utf8_tool tool;
683
684 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
685 tool.checked = pre_form->text;
686 tool.actual = result;
687 tool.remain = sizeof (result);
688 tool.compose = FALSE;
689
690 if (pre_form->width <= (gsize) width)
691 {
692 switch (HIDE_FIT (just_mode))
693 {
694 case J_CENTER_LEFT:
695 case J_CENTER:
696 tool.ident = (width - pre_form->width) / 2;
697 break;
698 case J_RIGHT:
699 tool.ident = width - pre_form->width;
700 break;
701 default:
702 tool.ident = 0;
703 break;
704 }
705
706 utf8_tool_insert_space (&tool, tool.ident);
707 utf8_tool_copy_chars_to_end (&tool);
708 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
709 }
710 else if (IS_FIT (just_mode))
711 {
712 tool.ident = 0;
713 utf8_tool_copy_chars_to (&tool, width / 2);
714 utf8_tool_insert_char (&tool, '~');
715
716 tool.ident = 0;
717 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
718 utf8_tool_copy_chars_to_end (&tool);
719 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
720 }
721 else
722 {
723 switch (HIDE_FIT (just_mode))
724 {
725 case J_CENTER:
726 tool.ident = (width - pre_form->width) / 2;
727 break;
728 case J_RIGHT:
729 tool.ident = width - pre_form->width;
730 break;
731 default:
732 tool.ident = 0;
733 break;
734 }
735
736 utf8_tool_skip_chars_to (&tool, 0);
737 utf8_tool_insert_space (&tool, tool.ident);
738 utf8_tool_copy_chars_to (&tool, width);
739 utf8_tool_insert_space (&tool, width - tool.ident);
740 }
741
742 tool.actual[0] = '\0';
743 if (tool.compose)
744 utf8_tool_compose (result, sizeof (result));
745 return result;
746 }
747
748
749
750 static const char *
751 str_utf8_term_trim (const char *text, int width)
752 {
753 static char result[BUF_MEDIUM * 6];
754 const struct term_form *pre_form;
755 struct utf8_tool tool;
756
757 if (width < 1)
758 {
759 result[0] = '\0';
760 return result;
761 }
762
763 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
764
765 tool.checked = pre_form->text;
766 tool.actual = result;
767 tool.remain = sizeof (result);
768 tool.compose = FALSE;
769
770 if ((gsize) width >= pre_form->width)
771 utf8_tool_copy_chars_to_end (&tool);
772 else if (width <= 3)
773 {
774 memset (tool.actual, '.', width);
775 tool.actual += width;
776 tool.remain -= width;
777 }
778 else
779 {
780 memset (tool.actual, '.', 3);
781 tool.actual += 3;
782 tool.remain -= 3;
783
784 tool.ident = 0;
785 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
786 utf8_tool_copy_chars_to_end (&tool);
787 }
788
789 tool.actual[0] = '\0';
790 if (tool.compose)
791 utf8_tool_compose (result, sizeof (result));
792 return result;
793 }
794
795
796
797 static int
798 str_utf8_term_width2 (const char *text, size_t length)
799 {
800 const struct term_form *result;
801
802 result = str_utf8_make_make_term_form (text, length);
803 return result->width;
804 }
805
806
807
/**
 * Column width of the whole string.
 *
 * @param text source string
 * @return str_utf8_term_width2() with no character limit
 */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t no_limit = (size_t) (-1);

    return str_utf8_term_width2 (text, no_limit);
}
813
814
815
816 static int
817 str_utf8_term_char_width (const char *text)
818 {
819 gunichar uni;
820
821 uni = g_utf8_get_char_validated (text, -1);
822 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
823 }
824
825
826
827 static const char *
828 str_utf8_term_substring (const char *text, int start, int width)
829 {
830 static char result[BUF_MEDIUM * 6];
831 const struct term_form *pre_form;
832 struct utf8_tool tool;
833
834 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
835
836 tool.checked = pre_form->text;
837 tool.actual = result;
838 tool.remain = sizeof (result);
839 tool.compose = FALSE;
840
841 tool.ident = -start;
842 utf8_tool_skip_chars_to (&tool, 0);
843 if (tool.ident < 0)
844 tool.ident = 0;
845 utf8_tool_insert_space (&tool, tool.ident);
846
847 utf8_tool_copy_chars_to (&tool, width);
848 utf8_tool_insert_space (&tool, width - tool.ident);
849
850 tool.actual[0] = '\0';
851 if (tool.compose)
852 utf8_tool_compose (result, sizeof (result));
853 return result;
854 }
855
856
857
858 static const char *
859 str_utf8_trunc (const char *text, int width)
860 {
861 static char result[MC_MAXPATHLEN * 6 * 2];
862 const struct term_form *pre_form;
863 struct utf8_tool tool;
864
865 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
866
867 tool.checked = pre_form->text;
868 tool.actual = result;
869 tool.remain = sizeof (result);
870 tool.compose = FALSE;
871
872 if (pre_form->width <= (gsize) width)
873 utf8_tool_copy_chars_to_end (&tool);
874 else
875 {
876 tool.ident = 0;
877 utf8_tool_copy_chars_to (&tool, width / 2);
878 utf8_tool_insert_char (&tool, '~');
879
880 tool.ident = 0;
881 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
882 utf8_tool_copy_chars_to_end (&tool);
883 }
884
885 tool.actual[0] = '\0';
886 if (tool.compose)
887 utf8_tool_compose (result, sizeof (result));
888 return result;
889 }
890
891
892
893 static int
894 str_utf8_offset_to_pos (const char *text, size_t length)
895 {
896 if (str_utf8_is_valid_string (text))
897 return g_utf8_offset_to_pointer (text, length) - text;
898 else
899 {
900 int result;
901 GString *buffer;
902
903 buffer = g_string_new (text);
904 str_utf8_fix_string (buffer->str);
905 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
906 g_string_free (buffer, TRUE);
907 return result;
908 }
909 }
910
911
912
913 static int
914 str_utf8_column_to_pos (const char *text, size_t pos)
915 {
916 int result = 0;
917 int width = 0;
918
919 while (text[0] != '\0')
920 {
921 gunichar uni;
922
923 uni = g_utf8_get_char_validated (text, 6);
924 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
925 {
926 if (g_unichar_isprint (uni))
927 {
928 if (!str_unichar_iscombiningmark (uni))
929 {
930 width++;
931 if (g_unichar_iswide (uni))
932 width++;
933 }
934 }
935 else
936 {
937 width++;
938 }
939 text = g_utf8_next_char (text);
940 }
941 else
942 {
943 text++;
944 width++;
945 }
946
947 if ((gsize) width > pos)
948 return result;
949
950 result++;
951 }
952
953 return result;
954 }
955
956
957
958 static char *
959 str_utf8_create_search_needle (const char *needle, gboolean case_sen)
960 {
961 char *fold, *result;
962
963 if (needle == NULL)
964 return NULL;
965
966 if (case_sen)
967 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
968
969 fold = g_utf8_casefold (needle, -1);
970 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
971 g_free (fold);
972 return result;
973 }
974
975
976
977 static void
978 str_utf8_release_search_needle (char *needle, gboolean case_sen)
979 {
980 (void) case_sen;
981 g_free (needle);
982 }
983
984
985
986 static const char *
987 str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
988 {
989 char *fold_text;
990 char *deco_text;
991 const char *match;
992 const char *result = NULL;
993 const char *m;
994
995 fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
996 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
997
998 match = deco_text;
999 do
1000 {
1001 match = g_strstr_len (match, -1, search);
1002 if (match != NULL)
1003 {
1004 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1005 !str_utf8_iscombiningmark (match + strlen (search)))
1006 {
1007 result = text;
1008 m = deco_text;
1009 while (m < match)
1010 {
1011 str_utf8_cnext_noncomb_char (&m);
1012 str_utf8_cnext_noncomb_char (&result);
1013 }
1014 }
1015 else
1016 str_utf8_cnext_char (&match);
1017 }
1018 }
1019 while (match != NULL && result == NULL);
1020
1021 g_free (deco_text);
1022 if (!case_sen)
1023 g_free (fold_text);
1024
1025 return result;
1026 }
1027
1028
1029
1030 static const char *
1031 str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
1032 {
1033 char *fold_text;
1034 char *deco_text;
1035 char *match;
1036 const char *result = NULL;
1037 const char *m;
1038
1039 fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
1040 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1041
1042 do
1043 {
1044 match = g_strrstr_len (deco_text, -1, search);
1045 if (match != NULL)
1046 {
1047 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1048 !str_utf8_iscombiningmark (match + strlen (search)))
1049 {
1050 result = text;
1051 m = deco_text;
1052 while (m < match)
1053 {
1054 str_utf8_cnext_noncomb_char (&m);
1055 str_utf8_cnext_noncomb_char (&result);
1056 }
1057 }
1058 else
1059 match[0] = '\0';
1060 }
1061 }
1062 while (match != NULL && result == NULL);
1063
1064 g_free (deco_text);
1065 if (!case_sen)
1066 g_free (fold_text);
1067
1068 return result;
1069 }
1070
1071
1072
1073 static char *
1074 str_utf8_normalize (const char *text)
1075 {
1076 GString *fixed;
1077 char *tmp;
1078 char *result;
1079 const char *start;
1080 const char *end;
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090 for (end = text; *end != '\0'; end++)
1091 if ((*end & 0x80) != 0)
1092 {
1093
1094 break;
1095 }
1096
1097
1098 if (*end == '\0')
1099 return g_strndup (text, end - text);
1100
1101 fixed = g_string_sized_new (4);
1102
1103 start = text;
1104 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1105 {
1106 if (start != end)
1107 {
1108 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1109 g_string_append (fixed, tmp);
1110 g_free (tmp);
1111 }
1112 g_string_append_c (fixed, end[0]);
1113 start = end + 1;
1114 }
1115
1116 if (start == text)
1117 {
1118 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1119 g_string_free (fixed, TRUE);
1120 }
1121 else
1122 {
1123 if (start[0] != '\0' && start != end)
1124 {
1125 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1126 g_string_append (fixed, tmp);
1127 g_free (tmp);
1128 }
1129 result = g_string_free (fixed, FALSE);
1130 }
1131
1132 return result;
1133 }
1134
1135
1136
1137 static char *
1138 str_utf8_casefold_normalize (const char *text)
1139 {
1140 GString *fixed;
1141 char *tmp, *fold;
1142 char *result;
1143 const char *start;
1144 const char *end;
1145
1146 fixed = g_string_sized_new (4);
1147
1148 start = text;
1149 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1150 {
1151 if (start != end)
1152 {
1153 fold = g_utf8_casefold (start, end - start);
1154 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1155 g_string_append (fixed, tmp);
1156 g_free (tmp);
1157 g_free (fold);
1158 }
1159 g_string_append_c (fixed, end[0]);
1160 start = end + 1;
1161 }
1162
1163 if (start == text)
1164 {
1165 fold = g_utf8_casefold (text, -1);
1166 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1167 g_free (fold);
1168 g_string_free (fixed, TRUE);
1169 }
1170 else
1171 {
1172 if (start[0] != '\0' && start != end)
1173 {
1174 fold = g_utf8_casefold (start, end - start);
1175 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1176 g_string_append (fixed, tmp);
1177 g_free (tmp);
1178 g_free (fold);
1179 }
1180 result = g_string_free (fixed, FALSE);
1181 }
1182
1183 return result;
1184 }
1185
1186
1187
/**
 * Compare two strings after normalization.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strcmp() result for the normalized forms
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *lhs = str_utf8_normalize (t1);
    char *rhs = str_utf8_normalize (t2);
    const int order = strcmp (lhs, rhs);

    g_free (lhs);
    g_free (rhs);

    return order;
}
1204
1205
1206
/**
 * Compare two strings after normalization, looking only at as many bytes as
 * the shorter normalized form has.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strncmp() result over the common length
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *lhs = str_utf8_normalize (t1);
    char *rhs = str_utf8_normalize (t2);
    const size_t lhs_len = strlen (lhs);
    const size_t rhs_len = strlen (rhs);
    const int order = strncmp (lhs, rhs, MIN (lhs_len, rhs_len));

    g_free (lhs);
    g_free (rhs);

    return order;
}
1226
1227
1228
/**
 * Compare two strings after case folding and normalization.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strcmp() result for the folded, normalized forms
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *lhs = str_utf8_casefold_normalize (t1);
    char *rhs = str_utf8_casefold_normalize (t2);
    const int order = strcmp (lhs, rhs);

    g_free (lhs);
    g_free (rhs);

    return order;
}
1245
1246
1247
/**
 * Compare two strings after case folding and normalization, looking only at
 * as many bytes as the shorter prepared form has.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strncmp() result over the common length
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *lhs = str_utf8_casefold_normalize (t1);
    char *rhs = str_utf8_casefold_normalize (t2);
    const size_t lhs_len = strlen (lhs);
    const size_t rhs_len = strlen (rhs);
    const int order = strncmp (lhs, rhs, MIN (lhs_len, rhs_len));

    g_free (lhs);
    g_free (rhs);

    return order;
}
1267
1268
1269
/**
 * Measure how much of @prefix matches the beginning of @text, comparing the
 * normalized forms character by character.
 *
 * @param text   string to test
 * @param prefix expected beginning
 * @return number of bytes of the normalized prefix that matched
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text, *norm_prefix;
    const char *cur_t, *cur_p;
    const char *next_t, *next_p;
    int matched;

    norm_text = str_utf8_normalize (text);
    norm_prefix = str_utf8_normalize (prefix);
    cur_t = next_t = norm_text;
    cur_p = next_p = norm_prefix;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ in byte length or in content: stop here */
        if (next_t - cur_t != next_p - cur_p || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    matched = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1304
1305
1306
/**
 * Measure how much of @prefix matches the beginning of @text, comparing the
 * case-folded, normalized forms character by character.
 *
 * @param text   string to test
 * @param prefix expected beginning
 * @return number of bytes of the prepared prefix that matched
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text, *norm_prefix;
    const char *cur_t, *cur_p;
    const char *next_t, *next_p;
    int matched;

    norm_text = str_utf8_casefold_normalize (text);
    norm_prefix = str_utf8_casefold_normalize (prefix);
    cur_t = next_t = norm_text;
    cur_p = next_p = norm_prefix;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ in byte length or in content: stop here */
        if (next_t - cur_t != next_p - cur_p || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    matched = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1341
1342
1343
1344 static char *
1345 str_utf8_create_key_gen (const char *text, gboolean case_sen,
1346 gchar * (*keygen) (const gchar * text, gssize size))
1347 {
1348 char *result;
1349
1350 if (case_sen)
1351 result = str_utf8_normalize (text);
1352 else
1353 {
1354 gboolean dot;
1355 GString *fixed;
1356 const char *start, *end;
1357 char *fold, *key;
1358
1359 dot = text[0] == '.';
1360 fixed = g_string_sized_new (16);
1361
1362 if (!dot)
1363 start = text;
1364 else
1365 {
1366 start = text + 1;
1367 g_string_append_c (fixed, '.');
1368 }
1369
1370 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1371 {
1372 if (start != end)
1373 {
1374 fold = g_utf8_casefold (start, end - start);
1375 key = keygen (fold, -1);
1376 g_string_append (fixed, key);
1377 g_free (key);
1378 g_free (fold);
1379 }
1380 g_string_append_c (fixed, end[0]);
1381 start = end + 1;
1382 }
1383
1384 if (start == text)
1385 {
1386 fold = g_utf8_casefold (start, -1);
1387 result = keygen (fold, -1);
1388 g_free (fold);
1389 g_string_free (fixed, TRUE);
1390 }
1391 else if (dot && (start == text + 1))
1392 {
1393 fold = g_utf8_casefold (start, -1);
1394 key = keygen (fold, -1);
1395 g_string_append (fixed, key);
1396 g_free (key);
1397 g_free (fold);
1398 result = g_string_free (fixed, FALSE);
1399 }
1400 else
1401 {
1402 if (start[0] != '\0' && start != end)
1403 {
1404 fold = g_utf8_casefold (start, end - start);
1405 key = keygen (fold, -1);
1406 g_string_append (fixed, key);
1407 g_free (key);
1408 g_free (fold);
1409 }
1410 result = g_string_free (fixed, FALSE);
1411 }
1412 }
1413 return result;
1414 }
1415
1416
1417
1418 static char *
1419 str_utf8_create_key (const char *text, gboolean case_sen)
1420 {
1421 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1422 }
1423
1424
1425
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/**
 * Build a comparison key using g_utf8_collate_key_for_filename() as the key
 * generator.
 *
 * @param text     source string
 * @param case_sen TRUE for a case-sensitive key
 * @return newly allocated key
 */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1433
1434
1435
1436 static int
1437 str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
1438 {
1439 (void) case_sen;
1440 return strcmp (t1, t2);
1441 }
1442
1443
1444
1445 static void
1446 str_utf8_release_key (char *key, gboolean case_sen)
1447 {
1448 (void) case_sen;
1449 g_free (key);
1450 }
1451
1452
1453
1454
1455
1456 struct str_class
1457 str_utf8_init (void)
1458 {
1459 struct str_class result;
1460
1461 result.conv_gerror_message = str_utf8_conv_gerror_message;
1462 result.vfs_convert_to = str_utf8_vfs_convert_to;
1463 result.insert_replace_char = str_utf8_insert_replace_char;
1464 result.is_valid_string = str_utf8_is_valid_string;
1465 result.is_valid_char = str_utf8_is_valid_char;
1466 result.cnext_char = str_utf8_cnext_char;
1467 result.cprev_char = str_utf8_cprev_char;
1468 result.cnext_char_safe = str_utf8_cnext_char_safe;
1469 result.cprev_char_safe = str_utf8_cprev_char_safe;
1470 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1471 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1472 result.char_isspace = str_utf8_isspace;
1473 result.char_ispunct = str_utf8_ispunct;
1474 result.char_isalnum = str_utf8_isalnum;
1475 result.char_isdigit = str_utf8_isdigit;
1476 result.char_isprint = str_utf8_isprint;
1477 result.char_iscombiningmark = str_utf8_iscombiningmark;
1478 result.char_toupper = str_utf8_toupper;
1479 result.char_tolower = str_utf8_tolower;
1480 result.length = str_utf8_length;
1481 result.length2 = str_utf8_length2;
1482 result.length_noncomb = str_utf8_length_noncomb;
1483 result.fix_string = str_utf8_fix_string;
1484 result.term_form = str_utf8_term_form;
1485 result.fit_to_term = str_utf8_fit_to_term;
1486 result.term_trim = str_utf8_term_trim;
1487 result.term_width2 = str_utf8_term_width2;
1488 result.term_width1 = str_utf8_term_width1;
1489 result.term_char_width = str_utf8_term_char_width;
1490 result.term_substring = str_utf8_term_substring;
1491 result.trunc = str_utf8_trunc;
1492 result.offset_to_pos = str_utf8_offset_to_pos;
1493 result.column_to_pos = str_utf8_column_to_pos;
1494 result.create_search_needle = str_utf8_create_search_needle;
1495 result.release_search_needle = str_utf8_release_search_needle;
1496 result.search_first = str_utf8_search_first;
1497 result.search_last = str_utf8_search_last;
1498 result.compare = str_utf8_compare;
1499 result.ncompare = str_utf8_ncompare;
1500 result.casecmp = str_utf8_casecmp;
1501 result.ncasecmp = str_utf8_ncasecmp;
1502 result.prefix = str_utf8_prefix;
1503 result.caseprefix = str_utf8_caseprefix;
1504 result.create_key = str_utf8_create_key;
1505 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1506
1507 result.create_key_for_filename = str_utf8_create_key_for_filename;
1508 #else
1509
1510 result.create_key_for_filename = str_utf8_create_key;
1511 #endif
1512 result.key_collate = str_utf8_key_collate;
1513 result.release_key = str_utf8_release_key;
1514
1515 return result;
1516 }
1517
1518