Commit 6408f79cce401e1bfecf923e7156f84f96e021e3
Committed by
David S. Miller
1 parent
df3fb93ad9
Exists in
master
and in
4 other branches
[LIB]: Naive finite state machine based textsearch
A finite state machine consists of n states (struct ts_fsm_token) representing the pattern as a finite automaton. The data is read sequentially on an octet basis. Every state token specifies the number of recurrences and the type of value accepted, which can be either a specific character or a ctype-based set of characters. The available types of recurrence include 1, (0|1), [0 n], and [1 n]. The algorithm distinguishes between strict and non-strict mode, specifying whether the pattern has to start at the first octet. Strict mode is enabled by default and can be disabled by inserting TS_FSM_HEAD_IGNORE as the first token in the chain. The runtime performance of the algorithm should be around O(n); however, while in strict mode the average runtime can be better. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 4 changed files with 398 additions and 0 deletions Side-by-side Diff
include/linux/textsearch_fsm.h
1 | +#ifndef __LINUX_TEXTSEARCH_FSM_H | |
2 | +#define __LINUX_TEXTSEARCH_FSM_H | |
3 | + | |
4 | +#include <linux/types.h> | |
5 | + | |
6 | +enum { /* token types: the class of octets a token accepts */ | |
7 | + TS_FSM_SPECIFIC, /* specific character */ | |
8 | + TS_FSM_WILDCARD, /* any character */ | |
9 | + TS_FSM_DIGIT, /* isdigit() */ | |
10 | + TS_FSM_XDIGIT, /* isxdigit() */ | |
11 | + TS_FSM_PRINT, /* isprint() */ | |
12 | + TS_FSM_ALPHA, /* isalpha() */ | |
13 | + TS_FSM_ALNUM, /* isalnum() */ | |
14 | + TS_FSM_ASCII, /* isascii() */ | |
15 | + TS_FSM_CNTRL, /* iscntrl() */ | |
16 | + TS_FSM_GRAPH, /* isgraph() */ | |
17 | + TS_FSM_LOWER, /* islower() */ | |
18 | + TS_FSM_UPPER, /* isupper() */ | |
19 | + TS_FSM_PUNCT, /* ispunct() */ | |
20 | + TS_FSM_SPACE, /* isspace() */ | |
21 | + __TS_FSM_TYPE_MAX, /* sentinel, one past the last valid type */ | |
22 | +}; | |
23 | +#define TS_FSM_TYPE_MAX (__TS_FSM_TYPE_MAX - 1) /* highest valid token type */ | |
24 | + | |
25 | +enum { /* recurrence kinds: how often a token may match */ | |
26 | + TS_FSM_SINGLE, /* 1 occurrence */ | |
27 | + TS_FSM_PERHAPS, /* 1 or 0 occurrence */ | |
28 | + TS_FSM_ANY, /* 0..n occurrences */ | |
29 | + TS_FSM_MULTI, /* 1..n occurrences */ | |
30 | + TS_FSM_HEAD_IGNORE, /* 0..n ignored occurrences at head */ | |
31 | + __TS_FSM_RECUR_MAX, /* sentinel, one past the last valid recurrence kind */ | |
32 | +}; | |
33 | +#define TS_FSM_RECUR_MAX (__TS_FSM_RECUR_MAX - 1) /* highest valid recurrence kind */ | |
34 | + | |
35 | +/** | |
36 | + * struct ts_fsm_token - state machine token (state) | |
37 | + * @type: type of token, one of the TS_FSM_SPECIFIC .. TS_FSM_SPACE values | |
38 | + * @recur: recurrence kind, one of the TS_FSM_SINGLE .. TS_FSM_HEAD_IGNORE values | |
39 | + * @value: character value compared against the data when @type is TS_FSM_SPECIFIC | |
40 | + */ | |
41 | +struct ts_fsm_token | |
42 | +{ | |
43 | + __u16 type; | |
44 | + __u8 recur; | |
45 | + __u8 value; | |
46 | +}; | |
47 | + | |
48 | +#endif |
lib/Kconfig
... | ... | @@ -80,5 +80,16 @@ |
80 | 80 | To compile this code as a module, choose M here: the |
81 | 81 | module will be called ts_kmp. |
82 | 82 | |
83 | +config TEXTSEARCH_FSM | |
84 | + depends on TEXTSEARCH | |
85 | + tristate "Finite state machine" | |
86 | + help | |
87 | + Say Y here if you want to be able to search text using a | |
88 | + naive finite state machine approach implementing a subset | |
89 | + of regular expressions. | |
90 | + | |
91 | + To compile this code as a module, choose M here: the | |
92 | + module will be called ts_fsm. | |
93 | + | |
83 | 94 | endmenu |
lib/Makefile
lib/ts_fsm.c
1 | +/* | |
2 | + * lib/ts_fsm.c A naive finite state machine text search approach | |
3 | + * | |
4 | + * This program is free software; you can redistribute it and/or | |
5 | + * modify it under the terms of the GNU General Public License | |
6 | + * as published by the Free Software Foundation; either version | |
7 | + * 2 of the License, or (at your option) any later version. | |
8 | + * | |
9 | + * Authors: Thomas Graf <tgraf@suug.ch> | |
10 | + * | |
11 | + * ========================================================================== | |
12 | + * | |
13 | + * A finite state machine consists of n states (struct ts_fsm_token) | |
14 | + * representing the pattern as a finite automaton. The data is read | |
15 | + * sequentially on an octet basis. Every state token specifies the number | |
16 | + * of recurrences and the type of value accepted which can be either a | |
17 | + * specific character or ctype based set of characters. The available | |
18 | + * type of recurrences include 1, (0|1), [0 n], and [1 n]. | |
19 | + * | |
20 | + * The algorithm differs between strict/non-strict mode, specifying | |
21 | + * whether the pattern has to start at the first octet. Strict mode | |
22 | + * is enabled by default and can be disabled by inserting | |
23 | + * TS_FSM_HEAD_IGNORE as the first token in the chain. | |
24 | + * | |
25 | + * The runtime performance of the algorithm should be around O(n), | |
26 | + * however while in strict mode the average runtime can be better. | |
27 | + */ | |
28 | + | |
29 | +#include <linux/config.h> | |
30 | +#include <linux/module.h> | |
31 | +#include <linux/types.h> | |
32 | +#include <linux/string.h> | |
33 | +#include <linux/ctype.h> | |
34 | +#include <linux/textsearch.h> | |
35 | +#include <linux/textsearch_fsm.h> | |
36 | + | |
37 | +struct ts_fsm /* private data of a config: the token chain to run */ | |
38 | +{ | |
39 | + unsigned int ntokens; /* number of entries in tokens[] */ | |
40 | + struct ts_fsm_token tokens[0]; /* trailing token array; fsm_init() stores token_map[] masks in ->type */ | |
41 | +}; | |
42 | + | |
43 | +/* extra flag bits defined here; the other flags used below (_C, _L, _U, ...) are derived from ctype.h */ | |
44 | +#define _A 0x100 /* ascii */ | |
45 | +#define _W 0x200 /* wildcard */ | |
46 | + | |
47 | +/* Map each TS_FSM_* token type to the flag mask that match_token() tests against token_lookup_tbl[] */ | |
48 | +static u16 token_map[TS_FSM_TYPE_MAX+1] = { | |
49 | + [TS_FSM_SPECIFIC] = 0, /* zero mask: match_token() compares ->value instead */ | |
50 | + [TS_FSM_WILDCARD] = _W, | |
51 | + [TS_FSM_CNTRL] = _C, | |
52 | + [TS_FSM_LOWER] = _L, | |
53 | + [TS_FSM_UPPER] = _U, | |
54 | + [TS_FSM_PUNCT] = _P, | |
55 | + [TS_FSM_SPACE] = _S, | |
56 | + [TS_FSM_DIGIT] = _D, | |
57 | + [TS_FSM_XDIGIT] = _D | _X, | |
58 | + [TS_FSM_ALPHA] = _U | _L, | |
59 | + [TS_FSM_ALNUM] = _U | _L | _D, | |
60 | + [TS_FSM_PRINT] = _P | _U | _L | _D | _SP, | |
61 | + [TS_FSM_GRAPH] = _P | _U | _L | _D, | |
62 | + [TS_FSM_ASCII] = _A, | |
63 | +}; | |
64 | +/* Flag set per octet value (index = octet); every entry has _W, entries 0-127 also have _A */ | |
65 | +static u16 token_lookup_tbl[256] = { | |
66 | +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */ | |
67 | +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */ | |
68 | +_W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */ | |
69 | +_W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */ | |
70 | +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */ | |
71 | +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */ | |
72 | +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */ | |
73 | +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */ | |
74 | +_W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */ | |
75 | +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */ | |
76 | +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */ | |
77 | +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */ | |
78 | +_W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */ | |
79 | +_W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */ | |
80 | +_W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */ | |
81 | +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */ | |
82 | +_W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */ | |
83 | +_W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */ | |
84 | +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */ | |
85 | +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */ | |
86 | +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */ | |
87 | +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */ | |
88 | +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */ | |
89 | +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */ | |
90 | +_W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */ | |
91 | +_W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */ | |
92 | +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */ | |
93 | +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */ | |
94 | +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */ | |
95 | +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */ | |
96 | +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */ | |
97 | +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */ | |
98 | +_W, _W, _W, _W, /* 128-131 */ | |
99 | +_W, _W, _W, _W, /* 132-135 */ | |
100 | +_W, _W, _W, _W, /* 136-139 */ | |
101 | +_W, _W, _W, _W, /* 140-143 */ | |
102 | +_W, _W, _W, _W, /* 144-147 */ | |
103 | +_W, _W, _W, _W, /* 148-151 */ | |
104 | +_W, _W, _W, _W, /* 152-155 */ | |
105 | +_W, _W, _W, _W, /* 156-159 */ | |
106 | +_W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */ | |
107 | +_W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */ | |
108 | +_W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */ | |
109 | +_W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */ | |
110 | +_W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */ | |
111 | +_W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */ | |
112 | +_W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */ | |
113 | +_W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */ | |
114 | +_W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */ | |
115 | +_W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */ | |
116 | +_W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */ | |
117 | +_W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */ | |
118 | +_W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */ | |
119 | +_W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */ | |
120 | +_W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */ | |
121 | +_W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */ | |
122 | +_W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */ | |
123 | +_W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */ | |
124 | +_W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */ | |
125 | +_W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */ | |
126 | +_W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */ | |
127 | +_W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */ | |
128 | +_W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */ | |
129 | +_W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */ | |
130 | +/* Return nonzero if octet d is accepted by token t. */ | |
131 | +static inline int match_token(struct ts_fsm_token *t, u8 d) | |
132 | +{ | |
133 | + if (t->type) /* non-zero mask: class test through the lookup table */ | |
134 | + return (token_lookup_tbl[d] & t->type) != 0; | |
135 | + else /* zero mask (TS_FSM_SPECIFIC in token_map): exact octet comparison */ | |
136 | + return t->value == d; | |
137 | +} | |
138 | +/* Run the token chain over the text; returns the offset where the match starts, or UINT_MAX. */ | |
139 | +static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state) | |
140 | +{ | |
141 | + struct ts_fsm *fsm = ts_config_priv(conf); | |
142 | + struct ts_fsm_token *cur = NULL, *next; | |
143 | + unsigned int match_start, block_idx = 0, tok_idx; | |
144 | + unsigned block_len = 0, strict, consumed = state->offset; | |
145 | + const u8 *data; | |
146 | +/* Add the finished block to 'consumed', reset the index and fetch the next block; yields its length. */ | |
147 | +#define GET_NEXT_BLOCK() \ | |
148 | +({ consumed += block_idx; \ | |
149 | + block_idx = 0; \ | |
150 | + block_len = conf->get_next_block(consumed, &data, conf, state); }) | |
151 | +/* Mismatch: fail in strict mode, otherwise skip one octet and restart at the first token. */ | |
152 | +#define TOKEN_MISMATCH() \ | |
153 | + do { \ | |
154 | + if (strict) \ | |
155 | + goto no_match; \ | |
156 | + block_idx++; \ | |
157 | + goto startover; \ | |
158 | + } while(0) | |
159 | +/* True when the current block is used up and get_next_block() returned 0. */ | |
160 | +#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK()) | |
161 | + | |
162 | + if (end_of_data()) | |
163 | + goto no_match; | |
164 | + | |
165 | + strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE; /* strict unless the chain starts with HEAD_IGNORE */ | |
166 | + | |
167 | +startover: | |
168 | + match_start = consumed + block_idx; /* offset at which this attempt begins */ | |
169 | + | |
170 | + for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) { | |
171 | + cur = &fsm->tokens[tok_idx]; | |
172 | + | |
173 | + if (likely(tok_idx < (fsm->ntokens - 1))) | |
174 | + next = &fsm->tokens[tok_idx + 1]; | |
175 | + else | |
176 | + next = NULL; /* cur is the last token */ | |
177 | + | |
178 | + switch (cur->recur) { | |
179 | + case TS_FSM_SINGLE: /* exactly one matching octet; consumed after the switch */ | |
180 | + if (end_of_data()) | |
181 | + goto no_match; | |
182 | + | |
183 | + if (!match_token(cur, data[block_idx])) | |
184 | + TOKEN_MISMATCH(); | |
185 | + break; | |
186 | + | |
187 | + case TS_FSM_PERHAPS: /* optional octet */ | |
188 | + if (end_of_data() || | |
189 | + !match_token(cur, data[block_idx])) | |
190 | + continue; /* nothing consumed, go on with the next token */ | |
191 | + break; | |
192 | + | |
193 | + case TS_FSM_MULTI: /* one mandatory match, then handled like TS_FSM_ANY */ | |
194 | + if (end_of_data()) | |
195 | + goto no_match; | |
196 | + | |
197 | + if (!match_token(cur, data[block_idx])) | |
198 | + TOKEN_MISMATCH(); | |
199 | + | |
200 | + block_idx++; | |
201 | + /* fall through */ | |
202 | + | |
203 | + case TS_FSM_ANY: | |
204 | + if (next == NULL) /* last token: the match is complete */ | |
205 | + goto found_match; | |
206 | + | |
207 | + if (end_of_data()) | |
208 | + continue; /* no data left: let the remaining tokens decide */ | |
209 | + | |
210 | + while (!match_token(next, data[block_idx])) { /* consume until 'next' accepts the octet */ | |
211 | + if (!match_token(cur, data[block_idx])) | |
212 | + TOKEN_MISMATCH(); | |
213 | + block_idx++; | |
214 | + if (end_of_data()) | |
215 | + goto no_match; | |
216 | + } | |
217 | + continue; /* the next token examines this same octet */ | |
218 | + | |
219 | + /* | |
220 | + * Optimization: Prefer small local loop over jumping | |
221 | + * back and forth until garbage at head is munched. | |
222 | + */ | |
223 | + case TS_FSM_HEAD_IGNORE: | |
224 | + if (end_of_data()) | |
225 | + continue; | |
226 | + | |
227 | + while (!match_token(next, data[block_idx])) { | |
228 | + /* | |
229 | + * Special case, don't start over upon | |
230 | + * a mismatch, give the user the | |
231 | + * chance to specify the type of data | |
232 | + * allowed to be ignored. | |
233 | + */ | |
234 | + if (!match_token(cur, data[block_idx])) | |
235 | + goto no_match; | |
236 | + | |
237 | + block_idx++; | |
238 | + if (end_of_data()) | |
239 | + goto no_match; | |
240 | + } | |
241 | + | |
242 | + match_start = consumed + block_idx; /* the match begins after the ignored head */ | |
243 | + continue; | |
244 | + } | |
245 | + | |
246 | + block_idx++; /* consume the octet matched by SINGLE or PERHAPS */ | |
247 | + } | |
248 | + | |
249 | + if (end_of_data()) /* all tokens processed: a match is reported only if no data is left */ | |
250 | + goto found_match; | |
251 | + | |
252 | +no_match: | |
253 | + return UINT_MAX; | |
254 | + | |
255 | +found_match: | |
256 | + state->offset = consumed + block_idx; /* record how far the search got */ | |
257 | + return match_start; | |
258 | +} | |
259 | +/* Validate the token array passed as pattern and copy it into a newly allocated ts_config. */ | |
260 | +static struct ts_config *fsm_init(const void *pattern, unsigned int len, | |
261 | + int gfp_mask) | |
262 | +{ /* returns the config, ERR_PTR(-EINVAL) for a bad pattern, or the error from alloc_ts_config() */ | |
263 | + int i, err = -EINVAL; | |
264 | + struct ts_config *conf; | |
265 | + struct ts_fsm *fsm; | |
266 | + struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern; | |
267 | + unsigned int ntokens = len / sizeof(*tokens); | |
268 | + size_t priv_size = sizeof(*fsm) + len; | |
269 | + | |
270 | + if (len % sizeof(struct ts_fsm_token) || ntokens < 1) /* need a whole number of tokens, at least one */ | |
271 | + goto errout; | |
272 | + | |
273 | + for (i = 0; i < ntokens; i++) { | |
274 | + struct ts_fsm_token *t = &tokens[i]; | |
275 | + | |
276 | + if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX) /* unknown type or recurrence */ | |
277 | + goto errout; | |
278 | + | |
279 | + if (t->recur == TS_FSM_HEAD_IGNORE && /* only allowed first, and not as the sole/last token */ | |
280 | + (i != 0 || i == (ntokens - 1))) | |
281 | + goto errout; | |
282 | + } | |
283 | + | |
284 | + conf = alloc_ts_config(priv_size, gfp_mask); | |
285 | + if (IS_ERR(conf)) | |
286 | + return conf; | |
287 | + | |
288 | + fsm = ts_config_priv(conf); | |
289 | + fsm->ntokens = ntokens; | |
290 | + memcpy(fsm->tokens, pattern, len); /* private copy of the caller's tokens */ | |
291 | + | |
292 | + for (i = 0; i < fsm->ntokens; i++) { | |
293 | + struct ts_fsm_token *t = &fsm->tokens[i]; | |
294 | + t->type = token_map[t->type]; /* replace the TS_FSM_* type by its flag mask for match_token() */ | |
295 | + } | |
296 | + | |
297 | + return conf; | |
298 | + | |
299 | +errout: | |
300 | + return ERR_PTR(err); | |
301 | +} | |
302 | +/* Return the token array stored in conf (->type fields hold the masks written by fsm_init()). */ | |
303 | +static void *fsm_get_pattern(struct ts_config *conf) | |
304 | +{ | |
305 | + struct ts_fsm *fsm = ts_config_priv(conf); | |
306 | + return fsm->tokens; | |
307 | +} | |
308 | +/* Return the size in bytes of the token array stored in conf. */ | |
309 | +static unsigned int fsm_get_pattern_len(struct ts_config *conf) | |
310 | +{ | |
311 | + struct ts_fsm *fsm = ts_config_priv(conf); | |
312 | + return fsm->ntokens * sizeof(struct ts_fsm_token); | |
313 | +} | |
314 | +/* Operations table for the "fsm" algorithm; passed to textsearch_register()/textsearch_unregister(). */ | |
315 | +static struct ts_ops fsm_ops = { | |
316 | + .name = "fsm", | |
317 | + .find = fsm_find, | |
318 | + .init = fsm_init, | |
319 | + .get_pattern = fsm_get_pattern, | |
320 | + .get_pattern_len = fsm_get_pattern_len, | |
321 | + .owner = THIS_MODULE, | |
322 | + .list = LIST_HEAD_INIT(fsm_ops.list) | |
323 | +}; | |
324 | +/* Module init: register fsm_ops and return the result of textsearch_register(). */ | |
325 | +static int __init init_fsm(void) | |
326 | +{ | |
327 | + return textsearch_register(&fsm_ops); | |
328 | +} | |
329 | +/* Module exit: unregister fsm_ops. */ | |
330 | +static void __exit exit_fsm(void) | |
331 | +{ | |
332 | + textsearch_unregister(&fsm_ops); | |
333 | +} | |
334 | +/* Module license declaration and init/exit hooks. */ | |
335 | +MODULE_LICENSE("GPL"); | |
336 | + | |
337 | +module_init(init_fsm); | |
338 | +module_exit(exit_fsm); | |