1 /*
2 Search text engine.
3 HEX-style pattern matching
4
5 Copyright (C) 2009-2025
6 Free Software Foundation, Inc.
7
8 Written by:
9 Slava Zanko <slavazanko@gmail.com>, 2009.
10
11 This file is part of the Midnight Commander.
12
13 The Midnight Commander is free software: you can redistribute it
14 and/or modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation, either version 3 of the License,
16 or (at your option) any later version.
17
18 The Midnight Commander is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program. If not, see <https://www.gnu.org/licenses/>.
25 */
26
#include <config.h>

#include <stdio.h>
#include <stdlib.h>  // strtoul

#include "lib/global.h"
#include "lib/strutil.h"
#include "lib/search.h"

#include "internal.h"
36
37 /*** global variables ****************************************************************************/
38
39 /*** file scope macro definitions ****************************************************************/
40
/* Result codes produced while parsing a hex search pattern. */
typedef enum
{
    // The pattern was parsed without errors.
    MC_SEARCH_HEX_E_OK,
    // A hexadecimal number does not fit in one byte (it is greater than 0xFF).
    MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE,
    // A token is neither whitespace, nor a hexadecimal number, nor a quoted string.
    MC_SEARCH_HEX_E_INVALID_CHARACTER,
    // A quoted string was opened but its closing quote was not found.
    MC_SEARCH_HEX_E_UNMATCHED_QUOTES
} mc_search_hex_parse_error_t;
48
49 /*** file scope type declarations ****************************************************************/
50
51 /*** forward declarations (file scope functions) *************************************************/
52
53 /*** file scope variables ************************************************************************/
54
55 /* --------------------------------------------------------------------------------------------- */
56 /*** file scope functions ************************************************************************/
57 /* --------------------------------------------------------------------------------------------- */
58
59 static GString *
60 mc_search__hex_translate_to_regex (const GString *astr, mc_search_hex_parse_error_t *error_ptr,
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
61 int *error_pos_ptr)
62 {
63 GString *buff;
64 const char *str;
65 gsize str_len;
66 gsize loop = 0;
67 mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
68
69 buff = g_string_sized_new (64);
70 str = astr->str;
71 str_len = astr->len;
72
73 while (loop < str_len && error == MC_SEARCH_HEX_E_OK)
74 {
75 unsigned int val;
76 int ptr;
77
78 if (g_ascii_isspace (str[loop]))
79 {
80 // Eat-up whitespace between tokens.
81 while (g_ascii_isspace (str[loop]))
82 loop++;
83 }
84 else if (sscanf (str + loop, "%x%n", &val, &ptr) == 1)
85 {
86 if (val > 255)
87 error = MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE;
88 else
89 {
90 g_string_append_printf (buff, "\\x%02X", val);
91 loop += ptr;
92 }
93 }
94 else if (str[loop] == '"')
95 {
96 gsize loop2;
97
98 loop2 = loop + 1;
99
100 while (loop2 < str_len)
101 {
102 if (str[loop2] == '"')
103 break;
104 if (str[loop2] == '\\' && loop2 + 1 < str_len)
105 loop2++;
106 g_string_append_c (buff, str[loop2]);
107 loop2++;
108 }
109
110 if (str[loop2] == '\0')
111 error = MC_SEARCH_HEX_E_UNMATCHED_QUOTES;
112 else
113 loop = loop2 + 1;
114 }
115 else
116 error = MC_SEARCH_HEX_E_INVALID_CHARACTER;
117 }
118
119 if (error != MC_SEARCH_HEX_E_OK)
120 {
121 g_string_free (buff, TRUE);
122 if (error_ptr != NULL)
123 *error_ptr = error;
124 if (error_pos_ptr != NULL)
125 *error_pos_ptr = loop;
126 return NULL;
127 }
128
129 return buff;
130 }
131
132 /* --------------------------------------------------------------------------------------------- */
133 /*** public functions ****************************************************************************/
134 /* --------------------------------------------------------------------------------------------- */
135
136 void
137 mc_search__cond_struct_new_init_hex (const char *charset, mc_search_t *lc_mc_search,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
138 mc_search_cond_t *mc_search_cond)
139 {
140 GString *tmp;
141 mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
142 int error_pos = 0;
143
144 /*
145 * We may be searching in binary data, which is often invalid UTF-8.
146 *
147 * We have to create a non UTF-8 regex (that is, G_REGEX_RAW) or else, as
148 * the data is invalid UTF-8, both GLib's PCRE and our
149 * mc_search__g_regex_match_full_safe() are going to fail us. The former by
150 * not finding all bytes, the latter by overwriting the supposedly invalid
151 * UTF-8 with NULs.
152 *
153 * To do this, we specify "ASCII" as the charset.
154 *
155 * In fact, we can specify any charset other than "UTF-8": any such charset
156 * will trigger G_REGEX_RAW (see [1]). The output of [2] will be the same
157 * for all charsets because it skips the \xXX symbols
158 * mc_search__hex_translate_to_regex() outputs.
159 *
160 * But "ASCII" is the best choice because a hex pattern may contain a
161 * quoted string: this way we know [2] will ignore any characters outside
162 * ASCII letters range (these ignored chars will be copied verbatim to the
163 * output and will match as-is; in other words, in a case-sensitive manner;
164 * If the user is interested in case-insensitive searches of international
165 * text, he shouldn't be using hex search in the first place.)
166 *
167 * Switching out of UTF-8 has another advantage:
168 *
169 * When doing case-insensitive searches, GLib treats \xXX symbols as normal
170 * letters and therefore matches both "a" and "A" for the hex pattern
171 * "0x61". When we switch out of UTF-8, we're switching to using [2], which
172 * doesn't have this issue.
173 *
174 * [1] mc_search__cond_struct_new_init_regex
175 * [2] mc_search__cond_struct_new_regex_ci_str
176 */
177 if (str_isutf8 (charset))
178 charset = "ASCII";
179
180 tmp = mc_search__hex_translate_to_regex (mc_search_cond->str, &error, &error_pos);
181 if (tmp != NULL)
182 {
183 g_string_free (mc_search_cond->str, TRUE);
184 mc_search_cond->str = tmp;
185 mc_search__cond_struct_new_init_regex (charset, lc_mc_search, mc_search_cond);
186 }
187 else
188 {
189 const char *desc;
190
191 switch (error)
192 {
193 case MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE:
194 desc = _ (
195 "Number out of range (should be in byte range, 0 <= n <= 0xFF, expressed in hex)");
196 break;
197 case MC_SEARCH_HEX_E_INVALID_CHARACTER:
198 desc = _ ("Invalid character");
199 break;
200 case MC_SEARCH_HEX_E_UNMATCHED_QUOTES:
201 desc = _ ("Unmatched quotes character");
202 break;
203 default:
204 desc = "";
205 }
206
207 lc_mc_search->error = MC_SEARCH_E_INPUT;
208 lc_mc_search->error_str =
209 g_strdup_printf (_ ("Hex pattern error at position %d:\n%s."), error_pos + 1, desc);
210 }
211 }
212
213 /* --------------------------------------------------------------------------------------------- */
214
215 gboolean
216 mc_search__run_hex (mc_search_t *lc_mc_search, const void *user_data, off_t start_search,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
217 off_t end_search, gsize *found_len)
218 {
219 return mc_search__run_regex (lc_mc_search, user_data, start_search, end_search, found_len);
220 }
221
222 /* --------------------------------------------------------------------------------------------- */
223
224 GString *
225 mc_search_hex_prepare_replace_str (mc_search_t *lc_mc_search, GString *replace_str)
/* ![[previous]](../icons/left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
226 {
227 (void) lc_mc_search;
228
229 return mc_g_string_dup (replace_str);
230 }
231
232 /* --------------------------------------------------------------------------------------------- */