This source file includes the following definitions.
- str_unichar_iscombiningmark
- str_utf8_insert_replace_char
- str_utf8_is_valid_string
- str_utf8_is_valid_char
- str_utf8_cnext_char
- str_utf8_cprev_char
- str_utf8_cnext_char_safe
- str_utf8_cprev_char_safe
- str_utf8_fix_string
- str_utf8_isspace
- str_utf8_ispunct
- str_utf8_isalnum
- str_utf8_isdigit
- str_utf8_isprint
- str_utf8_iscombiningmark
- str_utf8_cnext_noncomb_char
- str_utf8_cprev_noncomb_char
- str_utf8_toupper
- str_utf8_tolower
- str_utf8_length
- str_utf8_length2
- str_utf8_length_noncomb
- str_utf8_questmark_sustb
- str_utf8_conv_gerror_message
- str_utf8_vfs_convert_to
- str_utf8_make_make_term_form
- str_utf8_term_form
- utf8_tool_copy_chars_to_end
- utf8_tool_copy_chars_to
- utf8_tool_insert_space
- utf8_tool_insert_char
- utf8_tool_skip_chars_to
- utf8_tool_compose
- str_utf8_fit_to_term
- str_utf8_term_trim
- str_utf8_term_width2
- str_utf8_term_width1
- str_utf8_term_char_width
- str_utf8_term_substring
- str_utf8_trunc
- str_utf8_offset_to_pos
- str_utf8_column_to_pos
- str_utf8_create_search_needle
- str_utf8_release_search_needle
- str_utf8_search_first
- str_utf8_search_last
- str_utf8_normalize
- str_utf8_casefold_normalize
- str_utf8_compare
- str_utf8_ncompare
- str_utf8_casecmp
- str_utf8_ncasecmp
- str_utf8_prefix
- str_utf8_caseprefix
- str_utf8_create_key_gen
- str_utf8_create_key
- str_utf8_create_key_for_filename
- str_utf8_key_collate
- str_utf8_release_key
- str_utf8_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 #include <config.h>
27
28 #include <stdlib.h>
29 #include <langinfo.h>
30 #include <limits.h>
31 #include <string.h>
32
33 #include "lib/global.h"
34 #include "lib/strutil.h"
35
36
37
38
39
40
41
42
43
44 struct utf8_tool
45 {
46 char *actual;
47 size_t remain;
48 const char *checked;
49 int ident;
50 gboolean compose;
51 };
52
53 struct term_form
54 {
55 char text[BUF_MEDIUM * MB_LEN_MAX];
56 size_t width;
57 gboolean compose;
58 };
59
60
61
62
63
/* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, emitted for invalid bytes. */
static const char replch[] = "\xEF\xBF\xBD";
65
66
67
68
69
70 static gboolean
71 str_unichar_iscombiningmark (gunichar uni)
72 {
73 GUnicodeType type;
74
75 type = g_unichar_type (uni);
76 return (type == G_UNICODE_SPACING_MARK)
77 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
78 }
79
80
81
82 static void
83 str_utf8_insert_replace_char (GString * buffer)
84 {
85 g_string_append (buffer, replch);
86 }
87
88
89
90 static gboolean
91 str_utf8_is_valid_string (const char *text)
92 {
93 return g_utf8_validate (text, -1, NULL);
94 }
95
96
97
98 static int
99 str_utf8_is_valid_char (const char *ch, size_t size)
100 {
101 switch (g_utf8_get_char_validated (ch, size))
102 {
103 case (gunichar) (-2):
104 return (-2);
105 case (gunichar) (-1):
106 return (-1);
107 default:
108 return 1;
109 }
110 }
111
112
113
/* Advance '*text' to the next character; no validation is performed. */
static void
str_utf8_cnext_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_next_char (current);
}
119
120
121
/* Move '*text' back to the previous character; no validation is performed. */
static void
str_utf8_cprev_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_prev_char (current);
}
127
128
129
/* Advance '*text' by one character when it starts a valid UTF-8 sequence,
 * otherwise by a single byte. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *current = *text;

    *text = (str_utf8_is_valid_char (current, -1) == 1) ? g_utf8_next_char (current) : current + 1;
}
138
139
140
/* Step '*text' back by one character. The candidate start found by
 * g_utf8_prev_char () is accepted only if stepping forward from it lands
 * exactly on the old position; otherwise retreat by a single byte. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *candidate = g_utf8_prev_char (*text);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);
    *text = (probe == *text) ? candidate : *text - 1;
}
154
155
156
157 static void
158 str_utf8_fix_string (char *text)
159 {
160 while (text[0] != '\0')
161 {
162 gunichar uni;
163
164 uni = g_utf8_get_char_validated (text, -1);
165 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
166 text = g_utf8_next_char (text);
167 else
168 {
169 text[0] = '?';
170 text++;
171 }
172 }
173 }
174
175
176
177 static gboolean
178 str_utf8_isspace (const char *text)
179 {
180 gunichar uni;
181
182 uni = g_utf8_get_char_validated (text, -1);
183 return g_unichar_isspace (uni);
184 }
185
186
187
188 static gboolean
189 str_utf8_ispunct (const char *text)
190 {
191 gunichar uni;
192
193 uni = g_utf8_get_char_validated (text, -1);
194 return g_unichar_ispunct (uni);
195 }
196
197
198
199 static gboolean
200 str_utf8_isalnum (const char *text)
201 {
202 gunichar uni;
203
204 uni = g_utf8_get_char_validated (text, -1);
205 return g_unichar_isalnum (uni);
206 }
207
208
209
210 static gboolean
211 str_utf8_isdigit (const char *text)
212 {
213 gunichar uni;
214
215 uni = g_utf8_get_char_validated (text, -1);
216 return g_unichar_isdigit (uni);
217 }
218
219
220
221 static gboolean
222 str_utf8_isprint (const char *ch)
223 {
224 gunichar uni;
225
226 uni = g_utf8_get_char_validated (ch, -1);
227 return g_unichar_isprint (uni);
228 }
229
230
231
232 static gboolean
233 str_utf8_iscombiningmark (const char *ch)
234 {
235 gunichar uni;
236
237 uni = g_utf8_get_char_validated (ch, -1);
238 return str_unichar_iscombiningmark (uni);
239 }
240
241
242
/* Advance '*text' past the current character and any combining marks that
 * follow it. Returns the number of characters stepped over (0 at end of
 * string). */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return steps;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
258
259
260
/* Move '*text' backwards until it rests on a character that is not a
 * combining mark, never going before 'begin'. Returns the number of
 * characters stepped over (0 when already at 'begin'). */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if ((*text) == begin)
        return steps;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text) != begin);

    return steps;
}
276
277
278
279 static gboolean
280 str_utf8_toupper (const char *text, char **out, size_t * remain)
281 {
282 gunichar uni;
283 size_t left;
284
285 uni = g_utf8_get_char_validated (text, -1);
286 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
287 return FALSE;
288
289 uni = g_unichar_toupper (uni);
290 left = g_unichar_to_utf8 (uni, NULL);
291 if (left >= *remain)
292 return FALSE;
293
294 left = g_unichar_to_utf8 (uni, *out);
295 (*out) += left;
296 (*remain) -= left;
297 return TRUE;
298 }
299
300
301
302 static gboolean
303 str_utf8_tolower (const char *text, char **out, size_t * remain)
304 {
305 gunichar uni;
306 size_t left;
307
308 uni = g_utf8_get_char_validated (text, -1);
309 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
310 return FALSE;
311
312 uni = g_unichar_tolower (uni);
313 left = g_unichar_to_utf8 (uni, NULL);
314 if (left >= *remain)
315 return FALSE;
316
317 left = g_unichar_to_utf8 (uni, *out);
318 (*out) += left;
319 (*remain) -= left;
320 return TRUE;
321 }
322
323
324
/* Count the characters in 'text'. Each byte at which UTF-8 validation
 * fails is counted as one character. */
static int
str_utf8_length (const char *text)
{
    const char *chunk = text;
    const char *stop;
    int count = 0;

    for (;;)
    {
        /* 'stop' receives the first invalid byte, or the end of the string */
        if (g_utf8_validate (chunk, -1, &stop) || chunk[0] == '\0')
            break;

        if (chunk != stop)
            count += g_utf8_strlen (chunk, stop - chunk);

        /* the offending byte itself */
        count++;
        chunk = stop + 1;
    }

    if (chunk == text)
        return g_utf8_strlen (text, -1);

    if (chunk[0] != '\0' && chunk != stop)
        count += g_utf8_strlen (chunk, stop - chunk);

    return count;
}
349
350
351
/* Count the characters in at most 'size' bytes of 'text'. Each byte at
 * which UTF-8 validation fails is counted as one character while the byte
 * budget lasts. */
static int
str_utf8_length2 (const char *text, int size)
{
    const char *chunk = text;
    const char *stop;
    int count = 0;

    while (!g_utf8_validate (chunk, -1, &stop) && chunk[0] != '\0' && size > 0)
    {
        if (chunk != stop)
        {
            count += g_utf8_strlen (chunk, MIN (stop - chunk, size));
            size -= stop - chunk;
        }

        /* the invalid byte counts only if it is still inside the budget */
        if (size > 0)
            count++;
        size--;
        chunk = stop + 1;
    }

    if (chunk == text)
        return g_utf8_strlen (text, size);

    if (chunk[0] != '\0' && chunk != stop && size > 0)
        count += g_utf8_strlen (chunk, MIN (stop - chunk, size));

    return count;
}
379
380
381
/* Count the characters in 'text', treating a character together with the
 * combining marks that follow it as a single unit. */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *cursor;
    int count = 0;

    for (cursor = text; cursor[0] != '\0'; count++)
        str_utf8_cnext_noncomb_char (&cursor);

    return count;
}
396
397
398
#if 0
/* Disabled helper: skip one character of '*string', charge its byte length
 * against '*left' and append '?' to 'buffer' in its place. */
static void
str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
{
    char *after = g_utf8_next_char (*string);

    *left -= after - *string;
    *string = after;
    g_string_append_c (buffer, '?');
}
#endif
411
412
413
414 static gchar *
415 str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
416 {
417 if (mcerror != NULL)
418 return g_strdup (mcerror->message);
419
420 return g_strdup (def_msg != NULL ? def_msg : "");
421 }
422
423
424
425 static estr_t
426 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
427 {
428 estr_t result = ESTR_SUCCESS;
429
430 if (coder == str_cnv_not_convert)
431 g_string_append_len (buffer, string, size);
432 else
433 result = str_nconvert (coder, string, size, buffer);
434
435 return result;
436 }
437
438
439
440
441
442 static const struct term_form *
443 str_utf8_make_make_term_form (const char *text, size_t length)
444 {
445 static struct term_form result;
446 gunichar uni;
447 size_t left;
448 char *actual;
449
450 result.text[0] = '\0';
451 result.width = 0;
452 result.compose = FALSE;
453 actual = result.text;
454
455
456
457 if (length != 0 && text[0] != '\0')
458 {
459 uni = g_utf8_get_char_validated (text, -1);
460 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
461 && str_unichar_iscombiningmark (uni))
462 {
463 actual[0] = ' ';
464 actual++;
465 result.width++;
466 result.compose = TRUE;
467 }
468 }
469
470 while (length != 0 && text[0] != '\0')
471 {
472 uni = g_utf8_get_char_validated (text, -1);
473 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
474 {
475 if (g_unichar_isprint (uni))
476 {
477 left = g_unichar_to_utf8 (uni, actual);
478 actual += left;
479 if (str_unichar_iscombiningmark (uni))
480 result.compose = TRUE;
481 else
482 {
483 result.width++;
484 if (g_unichar_iswide (uni))
485 result.width++;
486 }
487 }
488 else
489 {
490 actual[0] = '.';
491 actual++;
492 result.width++;
493 }
494 text = g_utf8_next_char (text);
495 }
496 else
497 {
498 text++;
499
500 memcpy (actual, replch, strlen (replch));
501 actual += strlen (replch);
502 result.width++;
503 }
504
505 if (length != (size_t) (-1))
506 length--;
507 }
508 actual[0] = '\0';
509
510 return &result;
511 }
512
513
514
515 static const char *
516 str_utf8_term_form (const char *text)
517 {
518 static char result[BUF_MEDIUM * MB_LEN_MAX];
519 const struct term_form *pre_form;
520
521 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
522 if (pre_form->compose)
523 {
524 char *composed;
525
526 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
527 g_strlcpy (result, composed, sizeof (result));
528 g_free (composed);
529 }
530 else
531 g_strlcpy (result, pre_form->text, sizeof (result));
532
533 return result;
534 }
535
536
537
538
539 static gboolean
540 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
541 {
542 tool->compose = FALSE;
543
544 while (tool->checked[0] != '\0')
545 {
546 gunichar uni;
547 size_t left;
548
549 uni = g_utf8_get_char (tool->checked);
550 tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
551 left = g_unichar_to_utf8 (uni, NULL);
552 if (tool->remain <= left)
553 return FALSE;
554 left = g_unichar_to_utf8 (uni, tool->actual);
555 tool->actual += left;
556 tool->remain -= left;
557 tool->checked = g_utf8_next_char (tool->checked);
558 }
559
560 return TRUE;
561 }
562
563
564
565
566
567 static gboolean
568 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
569 {
570 tool->compose = FALSE;
571
572 while (tool->checked[0] != '\0')
573 {
574 gunichar uni;
575 size_t left;
576 int w = 0;
577
578 uni = g_utf8_get_char (tool->checked);
579 if (str_unichar_iscombiningmark (uni))
580 tool->compose = TRUE;
581 else
582 {
583 w = 1;
584 if (g_unichar_iswide (uni))
585 w++;
586 if (tool->ident + w > to_ident)
587 return TRUE;
588 }
589
590 left = g_unichar_to_utf8 (uni, NULL);
591 if (tool->remain <= left)
592 return FALSE;
593 left = g_unichar_to_utf8 (uni, tool->actual);
594 tool->actual += left;
595 tool->remain -= left;
596 tool->checked = g_utf8_next_char (tool->checked);
597 tool->ident += w;
598 }
599
600 return TRUE;
601 }
602
603
604
605
606 static int
607 utf8_tool_insert_space (struct utf8_tool *tool, int count)
608 {
609 if (count <= 0)
610 return 1;
611 if (tool->remain <= (gsize) count)
612 return 0;
613
614 memset (tool->actual, ' ', count);
615 tool->actual += count;
616 tool->remain -= count;
617 return 1;
618 }
619
620
621
622
623 static int
624 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
625 {
626 if (tool->remain <= 1)
627 return 0;
628
629 tool->actual[0] = ch;
630 tool->actual++;
631 tool->remain--;
632 return 1;
633 }
634
635
636
637
638
639 static gboolean
640 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
641 {
642 gunichar uni;
643
644 while (to_ident > tool->ident && tool->checked[0] != '\0')
645 {
646 uni = g_utf8_get_char (tool->checked);
647 if (!str_unichar_iscombiningmark (uni))
648 {
649 tool->ident++;
650 if (g_unichar_iswide (uni))
651 tool->ident++;
652 }
653 tool->checked = g_utf8_next_char (tool->checked);
654 }
655
656 uni = g_utf8_get_char (tool->checked);
657 while (str_unichar_iscombiningmark (uni))
658 {
659 tool->checked = g_utf8_next_char (tool->checked);
660 uni = g_utf8_get_char (tool->checked);
661 }
662
663 return TRUE;
664 }
665
666
667
668 static void
669 utf8_tool_compose (char *buffer, size_t size)
670 {
671 char *composed;
672
673 composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
674 g_strlcpy (buffer, composed, size);
675 g_free (composed);
676 }
677
678
679
680 static const char *
681 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
682 {
683 static char result[BUF_MEDIUM * MB_LEN_MAX];
684 const struct term_form *pre_form;
685 struct utf8_tool tool;
686
687 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
688 tool.checked = pre_form->text;
689 tool.actual = result;
690 tool.remain = sizeof (result);
691 tool.compose = FALSE;
692
693 if (pre_form->width <= (gsize) width)
694 {
695 switch (HIDE_FIT (just_mode))
696 {
697 case J_CENTER_LEFT:
698 case J_CENTER:
699 tool.ident = (width - pre_form->width) / 2;
700 break;
701 case J_RIGHT:
702 tool.ident = width - pre_form->width;
703 break;
704 default:
705 tool.ident = 0;
706 break;
707 }
708
709 utf8_tool_insert_space (&tool, tool.ident);
710 utf8_tool_copy_chars_to_end (&tool);
711 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
712 }
713 else if (IS_FIT (just_mode))
714 {
715 tool.ident = 0;
716 utf8_tool_copy_chars_to (&tool, width / 2);
717 utf8_tool_insert_char (&tool, '~');
718
719 tool.ident = 0;
720 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
721 utf8_tool_copy_chars_to_end (&tool);
722 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
723 }
724 else
725 {
726 switch (HIDE_FIT (just_mode))
727 {
728 case J_CENTER:
729 tool.ident = (width - pre_form->width) / 2;
730 break;
731 case J_RIGHT:
732 tool.ident = width - pre_form->width;
733 break;
734 default:
735 tool.ident = 0;
736 break;
737 }
738
739 utf8_tool_skip_chars_to (&tool, 0);
740 utf8_tool_insert_space (&tool, tool.ident);
741 utf8_tool_copy_chars_to (&tool, width);
742 utf8_tool_insert_space (&tool, width - tool.ident);
743 }
744
745 tool.actual[0] = '\0';
746 if (tool.compose)
747 utf8_tool_compose (result, sizeof (result));
748 return result;
749 }
750
751
752
753 static const char *
754 str_utf8_term_trim (const char *text, int width)
755 {
756 static char result[BUF_MEDIUM * MB_LEN_MAX];
757 const struct term_form *pre_form;
758 struct utf8_tool tool;
759
760 if (width < 1)
761 {
762 result[0] = '\0';
763 return result;
764 }
765
766 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
767
768 tool.checked = pre_form->text;
769 tool.actual = result;
770 tool.remain = sizeof (result);
771 tool.compose = FALSE;
772
773 if ((gsize) width >= pre_form->width)
774 utf8_tool_copy_chars_to_end (&tool);
775 else if (width <= 3)
776 {
777 memset (tool.actual, '.', width);
778 tool.actual += width;
779 tool.remain -= width;
780 }
781 else
782 {
783 memset (tool.actual, '.', 3);
784 tool.actual += 3;
785 tool.remain -= 3;
786
787 tool.ident = 0;
788 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
789 utf8_tool_copy_chars_to_end (&tool);
790 }
791
792 tool.actual[0] = '\0';
793 if (tool.compose)
794 utf8_tool_compose (result, sizeof (result));
795 return result;
796 }
797
798
799
800 static int
801 str_utf8_term_width2 (const char *text, size_t length)
802 {
803 const struct term_form *result;
804
805 result = str_utf8_make_make_term_form (text, length);
806 return result->width;
807 }
808
809
810
/* Width, in terminal columns, of the whole of 'text'. */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t whole_string = (size_t) (-1);

    return str_utf8_term_width2 (text, whole_string);
}
816
817
818
819 static int
820 str_utf8_term_char_width (const char *text)
821 {
822 gunichar uni;
823
824 uni = g_utf8_get_char_validated (text, -1);
825 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
826 }
827
828
829
830 static const char *
831 str_utf8_term_substring (const char *text, int start, int width)
832 {
833 static char result[BUF_MEDIUM * MB_LEN_MAX];
834 const struct term_form *pre_form;
835 struct utf8_tool tool;
836
837 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
838
839 tool.checked = pre_form->text;
840 tool.actual = result;
841 tool.remain = sizeof (result);
842 tool.compose = FALSE;
843
844 tool.ident = -start;
845 utf8_tool_skip_chars_to (&tool, 0);
846 if (tool.ident < 0)
847 tool.ident = 0;
848 utf8_tool_insert_space (&tool, tool.ident);
849
850 utf8_tool_copy_chars_to (&tool, width);
851 utf8_tool_insert_space (&tool, width - tool.ident);
852
853 tool.actual[0] = '\0';
854 if (tool.compose)
855 utf8_tool_compose (result, sizeof (result));
856 return result;
857 }
858
859
860
861 static const char *
862 str_utf8_trunc (const char *text, int width)
863 {
864 static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
865 const struct term_form *pre_form;
866 struct utf8_tool tool;
867
868 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
869
870 tool.checked = pre_form->text;
871 tool.actual = result;
872 tool.remain = sizeof (result);
873 tool.compose = FALSE;
874
875 if (pre_form->width <= (gsize) width)
876 utf8_tool_copy_chars_to_end (&tool);
877 else
878 {
879 tool.ident = 0;
880 utf8_tool_copy_chars_to (&tool, width / 2);
881 utf8_tool_insert_char (&tool, '~');
882
883 tool.ident = 0;
884 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
885 utf8_tool_copy_chars_to_end (&tool);
886 }
887
888 tool.actual[0] = '\0';
889 if (tool.compose)
890 utf8_tool_compose (result, sizeof (result));
891 return result;
892 }
893
894
895
896 static int
897 str_utf8_offset_to_pos (const char *text, size_t length)
898 {
899 if (str_utf8_is_valid_string (text))
900 return g_utf8_offset_to_pointer (text, length) - text;
901 else
902 {
903 int result;
904 GString *buffer;
905
906 buffer = g_string_new (text);
907 str_utf8_fix_string (buffer->str);
908 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
909 g_string_free (buffer, TRUE);
910 return result;
911 }
912 }
913
914
915
916 static int
917 str_utf8_column_to_pos (const char *text, size_t pos)
918 {
919 int result = 0;
920 int width = 0;
921
922 while (text[0] != '\0')
923 {
924 gunichar uni;
925
926 uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
927 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
928 {
929 if (g_unichar_isprint (uni))
930 {
931 if (!str_unichar_iscombiningmark (uni))
932 {
933 width++;
934 if (g_unichar_iswide (uni))
935 width++;
936 }
937 }
938 else
939 {
940 width++;
941 }
942 text = g_utf8_next_char (text);
943 }
944 else
945 {
946 text++;
947 width++;
948 }
949
950 if ((gsize) width > pos)
951 return result;
952
953 result++;
954 }
955
956 return result;
957 }
958
959
960
961 static char *
962 str_utf8_create_search_needle (const char *needle, gboolean case_sen)
963 {
964 char *fold, *result;
965
966 if (needle == NULL)
967 return NULL;
968
969 if (case_sen)
970 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
971
972 fold = g_utf8_casefold (needle, -1);
973 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
974 g_free (fold);
975 return result;
976 }
977
978
979
980 static void
981 str_utf8_release_search_needle (char *needle, gboolean case_sen)
982 {
983 (void) case_sen;
984 g_free (needle);
985 }
986
987
988
989 static const char *
990 str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
991 {
992 char *fold_text;
993 char *deco_text;
994 const char *match;
995 const char *result = NULL;
996 const char *m;
997
998 fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
999 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1000
1001 match = deco_text;
1002 do
1003 {
1004 match = g_strstr_len (match, -1, search);
1005 if (match != NULL)
1006 {
1007 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1008 !str_utf8_iscombiningmark (match + strlen (search)))
1009 {
1010 result = text;
1011 m = deco_text;
1012 while (m < match)
1013 {
1014 str_utf8_cnext_noncomb_char (&m);
1015 str_utf8_cnext_noncomb_char (&result);
1016 }
1017 }
1018 else
1019 str_utf8_cnext_char (&match);
1020 }
1021 }
1022 while (match != NULL && result == NULL);
1023
1024 g_free (deco_text);
1025 if (!case_sen)
1026 g_free (fold_text);
1027
1028 return result;
1029 }
1030
1031
1032
1033 static const char *
1034 str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
1035 {
1036 char *fold_text;
1037 char *deco_text;
1038 char *match;
1039 const char *result = NULL;
1040 const char *m;
1041
1042 fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
1043 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1044
1045 do
1046 {
1047 match = g_strrstr_len (deco_text, -1, search);
1048 if (match != NULL)
1049 {
1050 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1051 !str_utf8_iscombiningmark (match + strlen (search)))
1052 {
1053 result = text;
1054 m = deco_text;
1055 while (m < match)
1056 {
1057 str_utf8_cnext_noncomb_char (&m);
1058 str_utf8_cnext_noncomb_char (&result);
1059 }
1060 }
1061 else
1062 match[0] = '\0';
1063 }
1064 }
1065 while (match != NULL && result == NULL);
1066
1067 g_free (deco_text);
1068 if (!case_sen)
1069 g_free (fold_text);
1070
1071 return result;
1072 }
1073
1074
1075
1076 static char *
1077 str_utf8_normalize (const char *text)
1078 {
1079 GString *fixed;
1080 char *tmp;
1081 char *result;
1082 const char *start;
1083 const char *end;
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093 for (end = text; *end != '\0'; end++)
1094 if ((*end & 0x80) != 0)
1095 {
1096
1097 break;
1098 }
1099
1100
1101 if (*end == '\0')
1102 return g_strndup (text, end - text);
1103
1104 fixed = g_string_sized_new (4);
1105
1106 start = text;
1107 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1108 {
1109 if (start != end)
1110 {
1111 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1112 g_string_append (fixed, tmp);
1113 g_free (tmp);
1114 }
1115 g_string_append_c (fixed, end[0]);
1116 start = end + 1;
1117 }
1118
1119 if (start == text)
1120 {
1121 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1122 g_string_free (fixed, TRUE);
1123 }
1124 else
1125 {
1126 if (start[0] != '\0' && start != end)
1127 {
1128 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1129 g_string_append (fixed, tmp);
1130 g_free (tmp);
1131 }
1132 result = g_string_free (fixed, FALSE);
1133 }
1134
1135 return result;
1136 }
1137
1138
1139
1140 static char *
1141 str_utf8_casefold_normalize (const char *text)
1142 {
1143 GString *fixed;
1144 char *tmp, *fold;
1145 char *result;
1146 const char *start;
1147 const char *end;
1148
1149 fixed = g_string_sized_new (4);
1150
1151 start = text;
1152 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1153 {
1154 if (start != end)
1155 {
1156 fold = g_utf8_casefold (start, end - start);
1157 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1158 g_string_append (fixed, tmp);
1159 g_free (tmp);
1160 g_free (fold);
1161 }
1162 g_string_append_c (fixed, end[0]);
1163 start = end + 1;
1164 }
1165
1166 if (start == text)
1167 {
1168 fold = g_utf8_casefold (text, -1);
1169 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1170 g_free (fold);
1171 g_string_free (fixed, TRUE);
1172 }
1173 else
1174 {
1175 if (start[0] != '\0' && start != end)
1176 {
1177 fold = g_utf8_casefold (start, end - start);
1178 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1179 g_string_append (fixed, tmp);
1180 g_free (tmp);
1181 g_free (fold);
1182 }
1183 result = g_string_free (fixed, FALSE);
1184 }
1185
1186 return result;
1187 }
1188
1189
1190
/* Compare the normalized forms of 't1' and 't2' byte-wise (strcmp). */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1207
1208
1209
/* Compare the normalized forms of 't1' and 't2' over the byte length of
 * the shorter of the two. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, MIN (left_len, right_len));

    g_free (left);
    g_free (right);

    return order;
}
1229
1230
1231
/* Compare the case-folded, normalized forms of 't1' and 't2' byte-wise. */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1248
1249
1250
/* Compare the case-folded, normalized forms of 't1' and 't2' over the
 * byte length of the shorter of the two. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, MIN (left_len, right_len));

    g_free (left);
    g_free (right);

    return order;
}
1270
1271
1272
/* Return the length, in bytes of the normalized 'prefix', of the longest
 * run of leading characters that the normalized 'text' and 'prefix' share. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *cur_t = norm_text, *cur_p = norm_prefix;
    const char *next_t = norm_text, *next_p = norm_prefix;
    int matched;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ in byte length or in content */
        if ((next_t - cur_t != next_p - cur_p) || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    matched = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1307
1308
1309
/* Case-insensitive variant of the prefix test: return the length, in bytes
 * of the case-folded normalized 'prefix', of the longest run of leading
 * characters shared with the case-folded normalized 'text'. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *cur_t = norm_text, *cur_p = norm_prefix;
    const char *next_t = norm_text, *next_p = norm_prefix;
    int matched;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ in byte length or in content */
        if ((next_t - cur_t != next_p - cur_p) || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    matched = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1344
1345
1346
1347 static char *
1348 str_utf8_create_key_gen (const char *text, gboolean case_sen,
1349 gchar * (*keygen) (const gchar * text, gssize size))
1350 {
1351 char *result;
1352
1353 if (case_sen)
1354 result = str_utf8_normalize (text);
1355 else
1356 {
1357 gboolean dot;
1358 GString *fixed;
1359 const char *start, *end;
1360 char *fold, *key;
1361
1362 dot = text[0] == '.';
1363 fixed = g_string_sized_new (16);
1364
1365 if (!dot)
1366 start = text;
1367 else
1368 {
1369 start = text + 1;
1370 g_string_append_c (fixed, '.');
1371 }
1372
1373 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1374 {
1375 if (start != end)
1376 {
1377 fold = g_utf8_casefold (start, end - start);
1378 key = keygen (fold, -1);
1379 g_string_append (fixed, key);
1380 g_free (key);
1381 g_free (fold);
1382 }
1383 g_string_append_c (fixed, end[0]);
1384 start = end + 1;
1385 }
1386
1387 if (start == text)
1388 {
1389 fold = g_utf8_casefold (start, -1);
1390 result = keygen (fold, -1);
1391 g_free (fold);
1392 g_string_free (fixed, TRUE);
1393 }
1394 else if (dot && (start == text + 1))
1395 {
1396 fold = g_utf8_casefold (start, -1);
1397 key = keygen (fold, -1);
1398 g_string_append (fixed, key);
1399 g_free (key);
1400 g_free (fold);
1401 result = g_string_free (fixed, FALSE);
1402 }
1403 else
1404 {
1405 if (start[0] != '\0' && start != end)
1406 {
1407 fold = g_utf8_casefold (start, end - start);
1408 key = keygen (fold, -1);
1409 g_string_append (fixed, key);
1410 g_free (key);
1411 g_free (fold);
1412 }
1413 result = g_string_free (fixed, FALSE);
1414 }
1415 }
1416 return result;
1417 }
1418
1419
1420
1421 static char *
1422 str_utf8_create_key (const char *text, gboolean case_sen)
1423 {
1424 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1425 }
1426
1427
1428
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Build a sort key for 'text' using g_utf8_collate_key_for_filename (). */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1436
1437
1438
1439 static int
1440 str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
1441 {
1442 (void) case_sen;
1443 return strcmp (t1, t2);
1444 }
1445
1446
1447
1448 static void
1449 str_utf8_release_key (char *key, gboolean case_sen)
1450 {
1451 (void) case_sen;
1452 g_free (key);
1453 }
1454
1455
1456
1457
1458
1459 struct str_class
1460 str_utf8_init (void)
1461 {
1462 struct str_class result;
1463
1464 result.conv_gerror_message = str_utf8_conv_gerror_message;
1465 result.vfs_convert_to = str_utf8_vfs_convert_to;
1466 result.insert_replace_char = str_utf8_insert_replace_char;
1467 result.is_valid_string = str_utf8_is_valid_string;
1468 result.is_valid_char = str_utf8_is_valid_char;
1469 result.cnext_char = str_utf8_cnext_char;
1470 result.cprev_char = str_utf8_cprev_char;
1471 result.cnext_char_safe = str_utf8_cnext_char_safe;
1472 result.cprev_char_safe = str_utf8_cprev_char_safe;
1473 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1474 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1475 result.char_isspace = str_utf8_isspace;
1476 result.char_ispunct = str_utf8_ispunct;
1477 result.char_isalnum = str_utf8_isalnum;
1478 result.char_isdigit = str_utf8_isdigit;
1479 result.char_isprint = str_utf8_isprint;
1480 result.char_iscombiningmark = str_utf8_iscombiningmark;
1481 result.char_toupper = str_utf8_toupper;
1482 result.char_tolower = str_utf8_tolower;
1483 result.length = str_utf8_length;
1484 result.length2 = str_utf8_length2;
1485 result.length_noncomb = str_utf8_length_noncomb;
1486 result.fix_string = str_utf8_fix_string;
1487 result.term_form = str_utf8_term_form;
1488 result.fit_to_term = str_utf8_fit_to_term;
1489 result.term_trim = str_utf8_term_trim;
1490 result.term_width2 = str_utf8_term_width2;
1491 result.term_width1 = str_utf8_term_width1;
1492 result.term_char_width = str_utf8_term_char_width;
1493 result.term_substring = str_utf8_term_substring;
1494 result.trunc = str_utf8_trunc;
1495 result.offset_to_pos = str_utf8_offset_to_pos;
1496 result.column_to_pos = str_utf8_column_to_pos;
1497 result.create_search_needle = str_utf8_create_search_needle;
1498 result.release_search_needle = str_utf8_release_search_needle;
1499 result.search_first = str_utf8_search_first;
1500 result.search_last = str_utf8_search_last;
1501 result.compare = str_utf8_compare;
1502 result.ncompare = str_utf8_ncompare;
1503 result.casecmp = str_utf8_casecmp;
1504 result.ncasecmp = str_utf8_ncasecmp;
1505 result.prefix = str_utf8_prefix;
1506 result.caseprefix = str_utf8_caseprefix;
1507 result.create_key = str_utf8_create_key;
1508 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1509
1510 result.create_key_for_filename = str_utf8_create_key_for_filename;
1511 #else
1512
1513 result.create_key_for_filename = str_utf8_create_key;
1514 #endif
1515 result.key_collate = str_utf8_key_collate;
1516 result.release_key = str_utf8_release_key;
1517
1518 return result;
1519 }
1520
1521