This source file includes the following definitions:
- str_unichar_iscombiningmark
- str_utf8_insert_replace_char
- str_utf8_is_valid_string
- str_utf8_is_valid_char
- str_utf8_cnext_char
- str_utf8_cprev_char
- str_utf8_cnext_char_safe
- str_utf8_cprev_char_safe
- str_utf8_fix_string
- str_utf8_isspace
- str_utf8_ispunct
- str_utf8_isalnum
- str_utf8_isdigit
- str_utf8_isprint
- str_utf8_iscombiningmark
- str_utf8_cnext_noncomb_char
- str_utf8_cprev_noncomb_char
- str_utf8_toupper
- str_utf8_tolower
- str_utf8_length
- str_utf8_length2
- str_utf8_length_noncomb
- str_utf8_questmark_sustb
- str_utf8_conv_gerror_message
- str_utf8_vfs_convert_to
- str_utf8_make_make_term_form
- str_utf8_term_form
- utf8_tool_copy_chars_to_end
- utf8_tool_copy_chars_to
- utf8_tool_insert_space
- utf8_tool_insert_char
- utf8_tool_skip_chars_to
- utf8_tool_compose
- str_utf8_fit_to_term
- str_utf8_term_trim
- str_utf8_term_width2
- str_utf8_term_width1
- str_utf8_term_char_width
- str_utf8_term_substring
- str_utf8_trunc
- str_utf8_offset_to_pos
- str_utf8_column_to_pos
- str_utf8_create_search_needle
- str_utf8_release_search_needle
- str_utf8_search_first
- str_utf8_search_last
- str_utf8_normalize
- str_utf8_casefold_normalize
- str_utf8_compare
- str_utf8_ncompare
- str_utf8_casecmp
- str_utf8_ncasecmp
- str_utf8_prefix
- str_utf8_caseprefix
- str_utf8_create_key_gen
- str_utf8_create_key
- str_utf8_create_key_for_filename
- str_utf8_key_collate
- str_utf8_release_key
- str_utf8_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 #include <config.h>
27
28 #include <stdlib.h>
29 #include <langinfo.h>
30 #include <limits.h>
31 #include <string.h>
32
33 #include "lib/global.h"
34 #include "lib/strutil.h"
35
36
37
38
39
40
41
42
43
44 struct utf8_tool
45 {
46 char *actual;
47 size_t remain;
48 const char *checked;
49 int ident;
50 gboolean compose;
51 };
52
53 struct term_form
54 {
55 char text[BUF_MEDIUM * MB_LEN_MAX];
56 size_t width;
57 gboolean compose;
58 };
59
60
61
/* UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER), emitted in place of invalid bytes. */
static const char replch[] = "\xEF\xBF\xBD";
63
64
65
66
67
68 static gboolean
69 str_unichar_iscombiningmark (gunichar uni)
70 {
71 GUnicodeType type;
72
73 type = g_unichar_type (uni);
74 return (type == G_UNICODE_SPACING_MARK)
75 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
76 }
77
78
79
80 static void
81 str_utf8_insert_replace_char (GString * buffer)
82 {
83 g_string_append (buffer, replch);
84 }
85
86
87
88 static gboolean
89 str_utf8_is_valid_string (const char *text)
90 {
91 return g_utf8_validate (text, -1, NULL);
92 }
93
94
95
96 static int
97 str_utf8_is_valid_char (const char *ch, size_t size)
98 {
99 switch (g_utf8_get_char_validated (ch, size))
100 {
101 case (gunichar) (-2):
102 return (-2);
103 case (gunichar) (-1):
104 return (-1);
105 default:
106 return 1;
107 }
108 }
109
110
111
/**
 * Advance *text to the next character using g_utf8_next_char().
 * No validation is done on the bytes being skipped.
 *
 * @param text in/out position inside a string
 */
static void
str_utf8_cnext_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_next_char (current);
}
117
118
119
/**
 * Move *text back to the previous character using g_utf8_prev_char().
 * No validation is done on the bytes being stepped over.
 *
 * @param text in/out position inside a string
 */
static void
str_utf8_cprev_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_prev_char (current);
}
125
126
127
/**
 * Advance *text by one character when a valid one starts there,
 * otherwise by exactly one byte.
 *
 * @param text in/out position inside a string
 */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *current = *text;

    *text = (str_utf8_is_valid_char (current, -1) == 1) ? g_utf8_next_char (current) : current + 1;
}
136
137
138
/**
 * Move *text back by one character. The candidate found by
 * g_utf8_prev_char() is accepted only if stepping forward from it with
 * str_utf8_cnext_char_safe() lands exactly on the original position;
 * otherwise the position moves back by a single byte.
 *
 * @param text in/out position inside a string
 */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *here = *text;
    const char *candidate = g_utf8_prev_char (here);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);
    *text = (probe == here) ? candidate : here - 1;
}
152
153
154
155 static void
156 str_utf8_fix_string (char *text)
157 {
158 while (text[0] != '\0')
159 {
160 gunichar uni;
161
162 uni = g_utf8_get_char_validated (text, -1);
163 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
164 text = g_utf8_next_char (text);
165 else
166 {
167 text[0] = '?';
168 text++;
169 }
170 }
171 }
172
173
174
175 static gboolean
176 str_utf8_isspace (const char *text)
177 {
178 gunichar uni;
179
180 uni = g_utf8_get_char_validated (text, -1);
181 return g_unichar_isspace (uni);
182 }
183
184
185
186 static gboolean
187 str_utf8_ispunct (const char *text)
188 {
189 gunichar uni;
190
191 uni = g_utf8_get_char_validated (text, -1);
192 return g_unichar_ispunct (uni);
193 }
194
195
196
197 static gboolean
198 str_utf8_isalnum (const char *text)
199 {
200 gunichar uni;
201
202 uni = g_utf8_get_char_validated (text, -1);
203 return g_unichar_isalnum (uni);
204 }
205
206
207
208 static gboolean
209 str_utf8_isdigit (const char *text)
210 {
211 gunichar uni;
212
213 uni = g_utf8_get_char_validated (text, -1);
214 return g_unichar_isdigit (uni);
215 }
216
217
218
219 static gboolean
220 str_utf8_isprint (const char *ch)
221 {
222 gunichar uni;
223
224 uni = g_utf8_get_char_validated (ch, -1);
225 return g_unichar_isprint (uni);
226 }
227
228
229
230 static gboolean
231 str_utf8_iscombiningmark (const char *ch)
232 {
233 gunichar uni;
234
235 uni = g_utf8_get_char_validated (ch, -1);
236 return str_unichar_iscombiningmark (uni);
237 }
238
239
240
/**
 * Advance *text past the current character and any combining marks that
 * follow it, stopping at the next non-combining character or at the end.
 *
 * @param text in/out position inside a string
 * @return number of single steps taken (0 when already at the end)
 */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return 0;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while ((*text)[0] != '\0' && str_utf8_iscombiningmark (*text));

    return steps;
}
256
257
258
/**
 * Move *text backwards until it rests on a non-combining character or
 * reaches the beginning of the string.
 *
 * @param text  in/out position inside a string
 * @param begin first byte of the string; the position never goes before it
 * @return number of single steps taken (0 when already at begin)
 */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return 0;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (*text != begin && str_utf8_iscombiningmark (*text));

    return steps;
}
274
275
276
277 static gboolean
278 str_utf8_toupper (const char *text, char **out, size_t * remain)
279 {
280 gunichar uni;
281 size_t left;
282
283 uni = g_utf8_get_char_validated (text, -1);
284 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
285 return FALSE;
286
287 uni = g_unichar_toupper (uni);
288 left = g_unichar_to_utf8 (uni, NULL);
289 if (left >= *remain)
290 return FALSE;
291
292 left = g_unichar_to_utf8 (uni, *out);
293 (*out) += left;
294 (*remain) -= left;
295 return TRUE;
296 }
297
298
299
300 static gboolean
301 str_utf8_tolower (const char *text, char **out, size_t * remain)
302 {
303 gunichar uni;
304 size_t left;
305
306 uni = g_utf8_get_char_validated (text, -1);
307 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
308 return FALSE;
309
310 uni = g_unichar_tolower (uni);
311 left = g_unichar_to_utf8 (uni, NULL);
312 if (left >= *remain)
313 return FALSE;
314
315 left = g_unichar_to_utf8 (uni, *out);
316 (*out) += left;
317 (*remain) -= left;
318 return TRUE;
319 }
320
321
322
/**
 * Count the characters of a string that may contain invalid bytes.
 * Every valid stretch is measured with g_utf8_strlen(); every invalid byte
 * separating two stretches counts as one character.
 *
 * @param text NUL-terminated string
 * @return number of characters
 */
static int
str_utf8_length (const char *text)
{
    const char *segment = text;
    const char *stop;
    int total = 0;

    for (;;)
    {
        /* 'stop' receives the end of the valid prefix of 'segment' */
        if (g_utf8_validate (segment, -1, &stop) || segment[0] == '\0')
            break;

        if (segment != stop)
            total += g_utf8_strlen (segment, stop - segment);

        /* account for the invalid byte and resume right after it */
        total++;
        segment = stop + 1;
    }

    /* no invalid byte was met: the whole string can be measured at once */
    if (segment == text)
        return g_utf8_strlen (text, -1);

    if (segment[0] != '\0' && segment != stop)
        total += g_utf8_strlen (segment, stop - segment);

    return total;
}
347
348
349
/**
 * Count the characters found in at most 'size' bytes of a string that may
 * contain invalid bytes. Valid stretches are measured with g_utf8_strlen();
 * an invalid byte counts as one character while the byte budget lasts.
 *
 * @param text NUL-terminated string
 * @param size byte budget; it is handed to g_utf8_strlen() unchanged when
 *             the string contains no invalid byte
 * @return number of characters
 */
static int
str_utf8_length2 (const char *text, int size)
{
    const char *segment = text;
    const char *stop;
    int total = 0;

    while (!g_utf8_validate (segment, -1, &stop) && segment[0] != '\0' && size > 0)
    {
        if (segment != stop)
        {
            total += g_utf8_strlen (segment, MIN (stop - segment, size));
            size -= stop - segment;
        }

        /* the invalid byte only counts if the budget is not yet used up */
        if (size > 0)
            total++;
        size--;
        segment = stop + 1;
    }

    if (segment == text)
        total = g_utf8_strlen (text, size);
    else if (segment[0] != '\0' && segment != stop && size > 0)
        total += g_utf8_strlen (segment, MIN (stop - segment, size));

    return total;
}
377
378
379
/**
 * Count the characters of a string, treating a base character together
 * with the combining marks that follow it as one unit.
 *
 * @param text NUL-terminated string
 * @return number of units
 */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *cursor;
    int units = 0;

    for (cursor = text; cursor[0] != '\0'; units++)
        str_utf8_cnext_noncomb_char (&cursor);

    return units;
}
394
395
396
#if 0
/**
 * Disabled helper: skip one character of *string, reduce *left by the
 * number of bytes skipped and append '?' to buffer.
 */
static void
str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
{
    char *const after = g_utf8_next_char (*string);

    *left -= after - *string;
    *string = after;
    g_string_append_c (buffer, '?');
}
#endif
409
410
411
412 static gchar *
413 str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
414 {
415 if (mcerror != NULL)
416 return g_strdup (mcerror->message);
417
418 return g_strdup (def_msg != NULL ? def_msg : "");
419 }
420
421
422
423 static estr_t
424 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
425 {
426 estr_t result = ESTR_SUCCESS;
427
428 if (coder == str_cnv_not_convert)
429 g_string_append_len (buffer, string, size);
430 else
431 result = str_nconvert (coder, string, size, buffer);
432
433 return result;
434 }
435
436
437
438
439
440 static const struct term_form *
441 str_utf8_make_make_term_form (const char *text, size_t length)
442 {
443 static struct term_form result;
444 gunichar uni;
445 size_t left;
446 char *actual;
447
448 result.text[0] = '\0';
449 result.width = 0;
450 result.compose = FALSE;
451 actual = result.text;
452
453
454
455 if (length != 0 && text[0] != '\0')
456 {
457 uni = g_utf8_get_char_validated (text, -1);
458 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
459 && str_unichar_iscombiningmark (uni))
460 {
461 actual[0] = ' ';
462 actual++;
463 result.width++;
464 result.compose = TRUE;
465 }
466 }
467
468 while (length != 0 && text[0] != '\0')
469 {
470 uni = g_utf8_get_char_validated (text, -1);
471 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
472 {
473 if (g_unichar_isprint (uni))
474 {
475 left = g_unichar_to_utf8 (uni, actual);
476 actual += left;
477 if (str_unichar_iscombiningmark (uni))
478 result.compose = TRUE;
479 else
480 {
481 result.width++;
482 if (g_unichar_iswide (uni))
483 result.width++;
484 }
485 }
486 else
487 {
488 actual[0] = '.';
489 actual++;
490 result.width++;
491 }
492 text = g_utf8_next_char (text);
493 }
494 else
495 {
496 text++;
497
498 memcpy (actual, replch, strlen (replch));
499 actual += strlen (replch);
500 result.width++;
501 }
502
503 if (length != (size_t) (-1))
504 length--;
505 }
506 actual[0] = '\0';
507
508 return &result;
509 }
510
511
512
513 static const char *
514 str_utf8_term_form (const char *text)
515 {
516 static char result[BUF_MEDIUM * MB_LEN_MAX];
517 const struct term_form *pre_form;
518
519 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
520 if (pre_form->compose)
521 {
522 char *composed;
523
524 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
525 g_strlcpy (result, composed, sizeof (result));
526 g_free (composed);
527 }
528 else
529 g_strlcpy (result, pre_form->text, sizeof (result));
530
531 return result;
532 }
533
534
535
536
537 static gboolean
538 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
539 {
540 tool->compose = FALSE;
541
542 while (tool->checked[0] != '\0')
543 {
544 gunichar uni;
545 size_t left;
546
547 uni = g_utf8_get_char (tool->checked);
548 tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
549 left = g_unichar_to_utf8 (uni, NULL);
550 if (tool->remain <= left)
551 return FALSE;
552 left = g_unichar_to_utf8 (uni, tool->actual);
553 tool->actual += left;
554 tool->remain -= left;
555 tool->checked = g_utf8_next_char (tool->checked);
556 }
557
558 return TRUE;
559 }
560
561
562
563
564
565 static gboolean
566 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
567 {
568 tool->compose = FALSE;
569
570 while (tool->checked[0] != '\0')
571 {
572 gunichar uni;
573 size_t left;
574 int w = 0;
575
576 uni = g_utf8_get_char (tool->checked);
577 if (str_unichar_iscombiningmark (uni))
578 tool->compose = TRUE;
579 else
580 {
581 w = 1;
582 if (g_unichar_iswide (uni))
583 w++;
584 if (tool->ident + w > to_ident)
585 return TRUE;
586 }
587
588 left = g_unichar_to_utf8 (uni, NULL);
589 if (tool->remain <= left)
590 return FALSE;
591 left = g_unichar_to_utf8 (uni, tool->actual);
592 tool->actual += left;
593 tool->remain -= left;
594 tool->checked = g_utf8_next_char (tool->checked);
595 tool->ident += w;
596 }
597
598 return TRUE;
599 }
600
601
602
603
604 static int
605 utf8_tool_insert_space (struct utf8_tool *tool, int count)
606 {
607 if (count <= 0)
608 return 1;
609 if (tool->remain <= (gsize) count)
610 return 0;
611
612 memset (tool->actual, ' ', count);
613 tool->actual += count;
614 tool->remain -= count;
615 return 1;
616 }
617
618
619
620
621 static int
622 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
623 {
624 if (tool->remain <= 1)
625 return 0;
626
627 tool->actual[0] = ch;
628 tool->actual++;
629 tool->remain--;
630 return 1;
631 }
632
633
634
635
636
637 static gboolean
638 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
639 {
640 gunichar uni;
641
642 while (to_ident > tool->ident && tool->checked[0] != '\0')
643 {
644 uni = g_utf8_get_char (tool->checked);
645 if (!str_unichar_iscombiningmark (uni))
646 {
647 tool->ident++;
648 if (g_unichar_iswide (uni))
649 tool->ident++;
650 }
651 tool->checked = g_utf8_next_char (tool->checked);
652 }
653
654 uni = g_utf8_get_char (tool->checked);
655 while (str_unichar_iscombiningmark (uni))
656 {
657 tool->checked = g_utf8_next_char (tool->checked);
658 uni = g_utf8_get_char (tool->checked);
659 }
660
661 return TRUE;
662 }
663
664
665
666 static void
667 utf8_tool_compose (char *buffer, size_t size)
668 {
669 char *composed;
670
671 composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
672 g_strlcpy (buffer, composed, size);
673 g_free (composed);
674 }
675
676
677
678 static const char *
679 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
680 {
681 static char result[BUF_MEDIUM * MB_LEN_MAX];
682 const struct term_form *pre_form;
683 struct utf8_tool tool;
684
685 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
686 tool.checked = pre_form->text;
687 tool.actual = result;
688 tool.remain = sizeof (result);
689 tool.compose = FALSE;
690
691 if (pre_form->width <= (gsize) width)
692 {
693 switch (HIDE_FIT (just_mode))
694 {
695 case J_CENTER_LEFT:
696 case J_CENTER:
697 tool.ident = (width - pre_form->width) / 2;
698 break;
699 case J_RIGHT:
700 tool.ident = width - pre_form->width;
701 break;
702 default:
703 tool.ident = 0;
704 break;
705 }
706
707 utf8_tool_insert_space (&tool, tool.ident);
708 utf8_tool_copy_chars_to_end (&tool);
709 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
710 }
711 else if (IS_FIT (just_mode))
712 {
713 tool.ident = 0;
714 utf8_tool_copy_chars_to (&tool, width / 2);
715 utf8_tool_insert_char (&tool, '~');
716
717 tool.ident = 0;
718 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
719 utf8_tool_copy_chars_to_end (&tool);
720 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
721 }
722 else
723 {
724 switch (HIDE_FIT (just_mode))
725 {
726 case J_CENTER:
727 tool.ident = (width - pre_form->width) / 2;
728 break;
729 case J_RIGHT:
730 tool.ident = width - pre_form->width;
731 break;
732 default:
733 tool.ident = 0;
734 break;
735 }
736
737 utf8_tool_skip_chars_to (&tool, 0);
738 utf8_tool_insert_space (&tool, tool.ident);
739 utf8_tool_copy_chars_to (&tool, width);
740 utf8_tool_insert_space (&tool, width - tool.ident);
741 }
742
743 tool.actual[0] = '\0';
744 if (tool.compose)
745 utf8_tool_compose (result, sizeof (result));
746 return result;
747 }
748
749
750
751 static const char *
752 str_utf8_term_trim (const char *text, int width)
753 {
754 static char result[BUF_MEDIUM * MB_LEN_MAX];
755 const struct term_form *pre_form;
756 struct utf8_tool tool;
757
758 if (width < 1)
759 {
760 result[0] = '\0';
761 return result;
762 }
763
764 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
765
766 tool.checked = pre_form->text;
767 tool.actual = result;
768 tool.remain = sizeof (result);
769 tool.compose = FALSE;
770
771 if ((gsize) width >= pre_form->width)
772 utf8_tool_copy_chars_to_end (&tool);
773 else if (width <= 3)
774 {
775 memset (tool.actual, '.', width);
776 tool.actual += width;
777 tool.remain -= width;
778 }
779 else
780 {
781 memset (tool.actual, '.', 3);
782 tool.actual += 3;
783 tool.remain -= 3;
784
785 tool.ident = 0;
786 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
787 utf8_tool_copy_chars_to_end (&tool);
788 }
789
790 tool.actual[0] = '\0';
791 if (tool.compose)
792 utf8_tool_compose (result, sizeof (result));
793 return result;
794 }
795
796
797
798 static int
799 str_utf8_term_width2 (const char *text, size_t length)
800 {
801 const struct term_form *result;
802
803 result = str_utf8_make_make_term_form (text, length);
804 return result->width;
805 }
806
807
808
809 static int
810 str_utf8_term_width1 (const char *text)
811 {
812 return str_utf8_term_width2 (text, (size_t) (-1));
813 }
814
815
816
817 static int
818 str_utf8_term_char_width (const char *text)
819 {
820 gunichar uni;
821
822 uni = g_utf8_get_char_validated (text, -1);
823 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
824 }
825
826
827
828 static const char *
829 str_utf8_term_substring (const char *text, int start, int width)
830 {
831 static char result[BUF_MEDIUM * MB_LEN_MAX];
832 const struct term_form *pre_form;
833 struct utf8_tool tool;
834
835 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
836
837 tool.checked = pre_form->text;
838 tool.actual = result;
839 tool.remain = sizeof (result);
840 tool.compose = FALSE;
841
842 tool.ident = -start;
843 utf8_tool_skip_chars_to (&tool, 0);
844 if (tool.ident < 0)
845 tool.ident = 0;
846 utf8_tool_insert_space (&tool, tool.ident);
847
848 utf8_tool_copy_chars_to (&tool, width);
849 utf8_tool_insert_space (&tool, width - tool.ident);
850
851 tool.actual[0] = '\0';
852 if (tool.compose)
853 utf8_tool_compose (result, sizeof (result));
854 return result;
855 }
856
857
858
859 static const char *
860 str_utf8_trunc (const char *text, int width)
861 {
862 static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
863 const struct term_form *pre_form;
864 struct utf8_tool tool;
865
866 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
867
868 tool.checked = pre_form->text;
869 tool.actual = result;
870 tool.remain = sizeof (result);
871 tool.compose = FALSE;
872
873 if (pre_form->width <= (gsize) width)
874 utf8_tool_copy_chars_to_end (&tool);
875 else
876 {
877 tool.ident = 0;
878 utf8_tool_copy_chars_to (&tool, width / 2);
879 utf8_tool_insert_char (&tool, '~');
880
881 tool.ident = 0;
882 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
883 utf8_tool_copy_chars_to_end (&tool);
884 }
885
886 tool.actual[0] = '\0';
887 if (tool.compose)
888 utf8_tool_compose (result, sizeof (result));
889 return result;
890 }
891
892
893
894 static int
895 str_utf8_offset_to_pos (const char *text, size_t length)
896 {
897 if (str_utf8_is_valid_string (text))
898 return g_utf8_offset_to_pointer (text, length) - text;
899 else
900 {
901 int result;
902 GString *buffer;
903
904 buffer = g_string_new (text);
905 str_utf8_fix_string (buffer->str);
906 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
907 g_string_free (buffer, TRUE);
908 return result;
909 }
910 }
911
912
913
914 static int
915 str_utf8_column_to_pos (const char *text, size_t pos)
916 {
917 int result = 0;
918 int width = 0;
919
920 while (text[0] != '\0')
921 {
922 gunichar uni;
923
924 uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
925 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
926 {
927 if (g_unichar_isprint (uni))
928 {
929 if (!str_unichar_iscombiningmark (uni))
930 {
931 width++;
932 if (g_unichar_iswide (uni))
933 width++;
934 }
935 }
936 else
937 {
938 width++;
939 }
940 text = g_utf8_next_char (text);
941 }
942 else
943 {
944 text++;
945 width++;
946 }
947
948 if ((gsize) width > pos)
949 return result;
950
951 result++;
952 }
953
954 return result;
955 }
956
957
958
959 static char *
960 str_utf8_create_search_needle (const char *needle, gboolean case_sen)
961 {
962 char *fold, *result;
963
964 if (needle == NULL)
965 return NULL;
966
967 if (case_sen)
968 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
969
970 fold = g_utf8_casefold (needle, -1);
971 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
972 g_free (fold);
973 return result;
974 }
975
976
977
978 static void
979 str_utf8_release_search_needle (char *needle, gboolean case_sen)
980 {
981 (void) case_sen;
982 g_free (needle);
983 }
984
985
986
987 static const char *
988 str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
989 {
990 char *fold_text;
991 char *deco_text;
992 const char *match;
993 const char *result = NULL;
994 const char *m;
995
996 fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
997 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
998
999 match = deco_text;
1000 do
1001 {
1002 match = g_strstr_len (match, -1, search);
1003 if (match != NULL)
1004 {
1005 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1006 !str_utf8_iscombiningmark (match + strlen (search)))
1007 {
1008 result = text;
1009 m = deco_text;
1010 while (m < match)
1011 {
1012 str_utf8_cnext_noncomb_char (&m);
1013 str_utf8_cnext_noncomb_char (&result);
1014 }
1015 }
1016 else
1017 str_utf8_cnext_char (&match);
1018 }
1019 }
1020 while (match != NULL && result == NULL);
1021
1022 g_free (deco_text);
1023 if (!case_sen)
1024 g_free (fold_text);
1025
1026 return result;
1027 }
1028
1029
1030
1031 static const char *
1032 str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
1033 {
1034 char *fold_text;
1035 char *deco_text;
1036 char *match;
1037 const char *result = NULL;
1038 const char *m;
1039
1040 fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
1041 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1042
1043 do
1044 {
1045 match = g_strrstr_len (deco_text, -1, search);
1046 if (match != NULL)
1047 {
1048 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1049 !str_utf8_iscombiningmark (match + strlen (search)))
1050 {
1051 result = text;
1052 m = deco_text;
1053 while (m < match)
1054 {
1055 str_utf8_cnext_noncomb_char (&m);
1056 str_utf8_cnext_noncomb_char (&result);
1057 }
1058 }
1059 else
1060 match[0] = '\0';
1061 }
1062 }
1063 while (match != NULL && result == NULL);
1064
1065 g_free (deco_text);
1066 if (!case_sen)
1067 g_free (fold_text);
1068
1069 return result;
1070 }
1071
1072
1073
1074 static char *
1075 str_utf8_normalize (const char *text)
1076 {
1077 GString *fixed;
1078 char *tmp;
1079 char *result;
1080 const char *start;
1081 const char *end;
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091 for (end = text; *end != '\0'; end++)
1092 if ((*end & 0x80) != 0)
1093 {
1094
1095 break;
1096 }
1097
1098
1099 if (*end == '\0')
1100 return g_strndup (text, end - text);
1101
1102 fixed = g_string_sized_new (4);
1103
1104 start = text;
1105 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1106 {
1107 if (start != end)
1108 {
1109 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1110 g_string_append (fixed, tmp);
1111 g_free (tmp);
1112 }
1113 g_string_append_c (fixed, end[0]);
1114 start = end + 1;
1115 }
1116
1117 if (start == text)
1118 {
1119 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1120 g_string_free (fixed, TRUE);
1121 }
1122 else
1123 {
1124 if (start[0] != '\0' && start != end)
1125 {
1126 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1127 g_string_append (fixed, tmp);
1128 g_free (tmp);
1129 }
1130 result = g_string_free (fixed, FALSE);
1131 }
1132
1133 return result;
1134 }
1135
1136
1137
1138 static char *
1139 str_utf8_casefold_normalize (const char *text)
1140 {
1141 GString *fixed;
1142 char *tmp, *fold;
1143 char *result;
1144 const char *start;
1145 const char *end;
1146
1147 fixed = g_string_sized_new (4);
1148
1149 start = text;
1150 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1151 {
1152 if (start != end)
1153 {
1154 fold = g_utf8_casefold (start, end - start);
1155 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1156 g_string_append (fixed, tmp);
1157 g_free (tmp);
1158 g_free (fold);
1159 }
1160 g_string_append_c (fixed, end[0]);
1161 start = end + 1;
1162 }
1163
1164 if (start == text)
1165 {
1166 fold = g_utf8_casefold (text, -1);
1167 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1168 g_free (fold);
1169 g_string_free (fixed, TRUE);
1170 }
1171 else
1172 {
1173 if (start[0] != '\0' && start != end)
1174 {
1175 fold = g_utf8_casefold (start, end - start);
1176 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1177 g_string_append (fixed, tmp);
1178 g_free (tmp);
1179 g_free (fold);
1180 }
1181 result = g_string_free (fixed, FALSE);
1182 }
1183
1184 return result;
1185 }
1186
1187
1188
/**
 * Compare two strings after normalization.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strcmp() result for the normalized forms
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1205
1206
1207
/**
 * Compare two strings after normalization, looking only at as many bytes
 * as the shorter normalized form has.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strncmp() result over the common length
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, MIN (left_len, right_len));

    g_free (left);
    g_free (right);

    return order;
}
1227
1228
1229
/**
 * Compare two strings after case folding and normalization.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strcmp() result for the folded, normalized forms
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1246
1247
1248
/**
 * Compare two strings after case folding and normalization, looking only
 * at as many bytes as the shorter resulting form has.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strncmp() result over the common length
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, MIN (left_len, right_len));

    g_free (left);
    g_free (right);

    return order;
}
1268
1269
1270
/**
 * Measure how much of 'prefix' matches the beginning of 'text'.
 * Both strings are normalized first and then compared one character at a
 * time.
 *
 * @param text   string to examine
 * @param prefix candidate prefix
 * @return number of bytes of the normalized prefix that matched
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *t_cur = norm_text;
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;
    const char *p_next = norm_prefix;
    int matched;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters differ in length or in content: the match ends here */
        if (t_next - t_cur != p_next - p_cur || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    matched = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1305
1306
1307
/**
 * Measure how much of 'prefix' matches the beginning of 'text', ignoring
 * case. Both strings are case folded and normalized first and then
 * compared one character at a time.
 *
 * @param text   string to examine
 * @param prefix candidate prefix
 * @return number of bytes of the folded, normalized prefix that matched
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *t_cur = norm_text;
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;
    const char *p_next = norm_prefix;
    int matched;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters differ in length or in content: the match ends here */
        if (t_next - t_cur != p_next - p_cur || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    matched = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1342
1343
1344
1345 static char *
1346 str_utf8_create_key_gen (const char *text, gboolean case_sen,
1347 gchar * (*keygen) (const gchar * text, gssize size))
1348 {
1349 char *result;
1350
1351 if (case_sen)
1352 result = str_utf8_normalize (text);
1353 else
1354 {
1355 gboolean dot;
1356 GString *fixed;
1357 const char *start, *end;
1358 char *fold, *key;
1359
1360 dot = text[0] == '.';
1361 fixed = g_string_sized_new (16);
1362
1363 if (!dot)
1364 start = text;
1365 else
1366 {
1367 start = text + 1;
1368 g_string_append_c (fixed, '.');
1369 }
1370
1371 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1372 {
1373 if (start != end)
1374 {
1375 fold = g_utf8_casefold (start, end - start);
1376 key = keygen (fold, -1);
1377 g_string_append (fixed, key);
1378 g_free (key);
1379 g_free (fold);
1380 }
1381 g_string_append_c (fixed, end[0]);
1382 start = end + 1;
1383 }
1384
1385 if (start == text)
1386 {
1387 fold = g_utf8_casefold (start, -1);
1388 result = keygen (fold, -1);
1389 g_free (fold);
1390 g_string_free (fixed, TRUE);
1391 }
1392 else if (dot && (start == text + 1))
1393 {
1394 fold = g_utf8_casefold (start, -1);
1395 key = keygen (fold, -1);
1396 g_string_append (fixed, key);
1397 g_free (key);
1398 g_free (fold);
1399 result = g_string_free (fixed, FALSE);
1400 }
1401 else
1402 {
1403 if (start[0] != '\0' && start != end)
1404 {
1405 fold = g_utf8_casefold (start, end - start);
1406 key = keygen (fold, -1);
1407 g_string_append (fixed, key);
1408 g_free (key);
1409 g_free (fold);
1410 }
1411 result = g_string_free (fixed, FALSE);
1412 }
1413 }
1414 return result;
1415 }
1416
1417
1418
1419 static char *
1420 str_utf8_create_key (const char *text, gboolean case_sen)
1421 {
1422 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1423 }
1424
1425
1426
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/**
 * Build a sort key using g_utf8_collate_key_for_filename() as the key
 * generator.
 *
 * @param text     NUL-terminated string
 * @param case_sen TRUE for a case-sensitive key
 * @return newly allocated key (see str_utf8_create_key_gen())
 */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1434
1435
1436
1437 static int
1438 str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
1439 {
1440 (void) case_sen;
1441 return strcmp (t1, t2);
1442 }
1443
1444
1445
1446 static void
1447 str_utf8_release_key (char *key, gboolean case_sen)
1448 {
1449 (void) case_sen;
1450 g_free (key);
1451 }
1452
1453
1454
1455
1456
1457 struct str_class
1458 str_utf8_init (void)
1459 {
1460 struct str_class result;
1461
1462 result.conv_gerror_message = str_utf8_conv_gerror_message;
1463 result.vfs_convert_to = str_utf8_vfs_convert_to;
1464 result.insert_replace_char = str_utf8_insert_replace_char;
1465 result.is_valid_string = str_utf8_is_valid_string;
1466 result.is_valid_char = str_utf8_is_valid_char;
1467 result.cnext_char = str_utf8_cnext_char;
1468 result.cprev_char = str_utf8_cprev_char;
1469 result.cnext_char_safe = str_utf8_cnext_char_safe;
1470 result.cprev_char_safe = str_utf8_cprev_char_safe;
1471 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1472 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1473 result.char_isspace = str_utf8_isspace;
1474 result.char_ispunct = str_utf8_ispunct;
1475 result.char_isalnum = str_utf8_isalnum;
1476 result.char_isdigit = str_utf8_isdigit;
1477 result.char_isprint = str_utf8_isprint;
1478 result.char_iscombiningmark = str_utf8_iscombiningmark;
1479 result.char_toupper = str_utf8_toupper;
1480 result.char_tolower = str_utf8_tolower;
1481 result.length = str_utf8_length;
1482 result.length2 = str_utf8_length2;
1483 result.length_noncomb = str_utf8_length_noncomb;
1484 result.fix_string = str_utf8_fix_string;
1485 result.term_form = str_utf8_term_form;
1486 result.fit_to_term = str_utf8_fit_to_term;
1487 result.term_trim = str_utf8_term_trim;
1488 result.term_width2 = str_utf8_term_width2;
1489 result.term_width1 = str_utf8_term_width1;
1490 result.term_char_width = str_utf8_term_char_width;
1491 result.term_substring = str_utf8_term_substring;
1492 result.trunc = str_utf8_trunc;
1493 result.offset_to_pos = str_utf8_offset_to_pos;
1494 result.column_to_pos = str_utf8_column_to_pos;
1495 result.create_search_needle = str_utf8_create_search_needle;
1496 result.release_search_needle = str_utf8_release_search_needle;
1497 result.search_first = str_utf8_search_first;
1498 result.search_last = str_utf8_search_last;
1499 result.compare = str_utf8_compare;
1500 result.ncompare = str_utf8_ncompare;
1501 result.casecmp = str_utf8_casecmp;
1502 result.ncasecmp = str_utf8_ncasecmp;
1503 result.prefix = str_utf8_prefix;
1504 result.caseprefix = str_utf8_caseprefix;
1505 result.create_key = str_utf8_create_key;
1506 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1507
1508 result.create_key_for_filename = str_utf8_create_key_for_filename;
1509 #else
1510
1511 result.create_key_for_filename = str_utf8_create_key;
1512 #endif
1513 result.key_collate = str_utf8_key_collate;
1514 result.release_key = str_utf8_release_key;
1515
1516 return result;
1517 }
1518
1519