Commit 6408f79cce401e1bfecf923e7156f84f96e021e3

Authored by Thomas Graf
Committed by David S. Miller
1 parent df3fb93ad9

[LIB]: Naive finite state machine based textsearch

A finite state machine consists of n states (struct ts_fsm_token)
representing the pattern as a finite automaton. The data is read
sequentially on an octet basis. Every state token specifies the number
of recurrences and the type of value accepted which can be either a
specific character or ctype based set of characters. The available
type of recurrences include 1, (0|1), [0 n], and [1 n].

The algorithm distinguishes between strict and non-strict mode, specifying
whether the pattern has to start at the first octet. Strict mode
is enabled by default and can be disabled by inserting
TS_FSM_HEAD_IGNORE as the first token in the chain.

The runtime performance of the algorithm should be around O(n),
however while in strict mode the average runtime can be better.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 4 changed files with 398 additions and 0 deletions Side-by-side Diff

include/linux/textsearch_fsm.h
  1 +#ifndef __LINUX_TEXTSEARCH_FSM_H
  2 +#define __LINUX_TEXTSEARCH_FSM_H
  3 +
  4 +#include <linux/types.h>
  5 +
  6 +enum { /* token types: the class of octets a token accepts */
  7 + TS_FSM_SPECIFIC, /* specific character */
  8 + TS_FSM_WILDCARD, /* any character */
  9 + TS_FSM_DIGIT, /* isdigit() */
  10 + TS_FSM_XDIGIT, /* isxdigit() */
  11 + TS_FSM_PRINT, /* isprint() */
  12 + TS_FSM_ALPHA, /* isalpha() */
  13 + TS_FSM_ALNUM, /* isalnum() */
  14 + TS_FSM_ASCII, /* isascii() */
  15 + TS_FSM_CNTRL, /* iscntrl() */
  16 + TS_FSM_GRAPH, /* isgraph() */
  17 + TS_FSM_LOWER, /* islower() */
  18 + TS_FSM_UPPER, /* isupper() */
  19 + TS_FSM_PUNCT, /* ispunct() */
  20 + TS_FSM_SPACE, /* isspace() */
  21 + __TS_FSM_TYPE_MAX, /* sentinel: number of token types, keep last */
  22 +};
  23 +#define TS_FSM_TYPE_MAX (__TS_FSM_TYPE_MAX - 1) /* highest valid token type */
  24 +
  25 +enum { /* recurrence modes: how often a token may or must match */
  26 + TS_FSM_SINGLE, /* 1 occurrence */
  27 + TS_FSM_PERHAPS, /* 1 or 0 occurrence */
  28 + TS_FSM_ANY, /* 0..n occurrences */
  29 + TS_FSM_MULTI, /* 1..n occurrences */
  30 + TS_FSM_HEAD_IGNORE, /* 0..n ignored occurrences at head */
  31 + __TS_FSM_RECUR_MAX, /* sentinel: number of recurrence modes, keep last */
  32 +};
  33 +#define TS_FSM_RECUR_MAX (__TS_FSM_RECUR_MAX - 1) /* highest valid recurrence mode */
  34 +
  35 +/**
  36 + * struct ts_fsm_token - state machine token (state)
  37 + * @type: type of token, one of TS_FSM_SPECIFIC .. TS_FSM_SPACE
  38 + * @recur: recurrence mode, one of TS_FSM_SINGLE .. TS_FSM_HEAD_IGNORE
  39 + * @value: character value, only compared for TS_FSM_SPECIFIC tokens
  40 + */
  41 +struct ts_fsm_token
  42 +{
  43 + __u16 type;
  44 + __u8 recur;
  45 + __u8 value;
  46 +};
  47 +
  48 +#endif
... ... @@ -80,5 +80,16 @@
80 80 To compile this code as a module, choose M here: the
81 81 module will be called ts_kmp.
82 82  
  83 +config TEXTSEARCH_FSM
  84 + depends on TEXTSEARCH
  85 + tristate "Finite state machine"
  86 + help
  87 + Say Y here if you want to be able to search text using a
  88 + naive finite state machine approach implementing a subset
  89 + of regular expressions.
  90 +
  91 + To compile this code as a module, choose M here: the
  92 + module will be called ts_fsm.
  93 +
83 94 endmenu
... ... @@ -38,6 +38,7 @@
38 38  
39 39 lib-$(CONFIG_TEXTSEARCH) += textsearch.o
40 40 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
  41 +obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
41 42  
42 43 hostprogs-y := gen_crc32table
43 44 clean-files := crc32table.h
  1 +/*
  2 + * lib/ts_fsm.c A naive finite state machine text search approach
  3 + *
  4 + * This program is free software; you can redistribute it and/or
  5 + * modify it under the terms of the GNU General Public License
  6 + * as published by the Free Software Foundation; either version
  7 + * 2 of the License, or (at your option) any later version.
  8 + *
  9 + * Authors: Thomas Graf <tgraf@suug.ch>
  10 + *
  11 + * ==========================================================================
  12 + *
  13 + * A finite state machine consists of n states (struct ts_fsm_token)
  14 + * representing the pattern as a finite automaton. The data is read
  15 + * sequentially on an octet basis. Every state token specifies the number
  16 + * of recurrences and the type of value accepted which can be either a
  17 + * specific character or ctype based set of characters. The available
  18 + * type of recurrences include 1, (0|1), [0 n], and [1 n].
  19 + *
  20 + * The algorithm distinguishes between strict and non-strict mode, specifying
  21 + * whether the pattern has to start at the first octet. Strict mode
  22 + * is enabled by default and can be disabled by inserting
  23 + * TS_FSM_HEAD_IGNORE as the first token in the chain.
  24 + *
  25 + * The runtime performance of the algorithm should be around O(n),
  26 + * however while in strict mode the average runtime can be better.
  27 + */
  28 +
  29 +#include <linux/config.h>
  30 +#include <linux/module.h>
  31 +#include <linux/types.h>
  32 +#include <linux/string.h>
  33 +#include <linux/ctype.h>
  34 +#include <linux/textsearch.h>
  35 +#include <linux/textsearch_fsm.h>
  36 +
  37 +struct ts_fsm /* private data of a config: the token chain copied in by fsm_init() */
  38 +{
  39 + unsigned int ntokens; /* number of entries in tokens[] */
  40 + struct ts_fsm_token tokens[0]; /* trailing token array, sized at allocation time */
  41 +};
  42 +
  43 +/* flag bits used in addition to the ctype.h ones (_C, _L, _U, ...) */
  44 +#define _A 0x100 /* ascii */
  45 +#define _W 0x200 /* wildcard */
  46 +
  47 +/* Map TS_FSM_* token types to the flag masks tested against token_lookup_tbl[] */
  48 +static u16 token_map[TS_FSM_TYPE_MAX+1] = {
  49 + [TS_FSM_SPECIFIC] = 0, /* 0 makes match_token() compare against the token's value instead */
  50 + [TS_FSM_WILDCARD] = _W,
  51 + [TS_FSM_CNTRL] = _C,
  52 + [TS_FSM_LOWER] = _L,
  53 + [TS_FSM_UPPER] = _U,
  54 + [TS_FSM_PUNCT] = _P,
  55 + [TS_FSM_SPACE] = _S,
  56 + [TS_FSM_DIGIT] = _D,
  57 + [TS_FSM_XDIGIT] = _D | _X, /* digits plus the letters flagged _X in token_lookup_tbl[] */
  58 + [TS_FSM_ALPHA] = _U | _L,
  59 + [TS_FSM_ALNUM] = _U | _L | _D,
  60 + [TS_FSM_PRINT] = _P | _U | _L | _D | _SP,
  61 + [TS_FSM_GRAPH] = _P | _U | _L | _D,
  62 + [TS_FSM_ASCII] = _A,
  63 +};
  64 +
  65 +static u16 token_lookup_tbl[256] = { /* class flags per octet value, indexed by the data byte in match_token() */
  66 +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */
  67 +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */
  68 +_W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */
  69 +_W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */
  70 +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */
  71 +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */
  72 +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */
  73 +_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */
  74 +_W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */
  75 +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */
  76 +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */
  77 +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */
  78 +_W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */
  79 +_W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */
  80 +_W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */
  81 +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */
  82 +_W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */
  83 +_W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */
  84 +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */
  85 +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */
  86 +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */
  87 +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */
  88 +_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */
  89 +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */
  90 +_W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */
  91 +_W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */
  92 +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */
  93 +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */
  94 +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */
  95 +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */
  96 +_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */
  97 +_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */
  98 +_W, _W, _W, _W, /* 128-131 */
  99 +_W, _W, _W, _W, /* 132-135 */
  100 +_W, _W, _W, _W, /* 136-139 */
  101 +_W, _W, _W, _W, /* 140-143 */
  102 +_W, _W, _W, _W, /* 144-147 */
  103 +_W, _W, _W, _W, /* 148-151 */
  104 +_W, _W, _W, _W, /* 152-155 */
  105 +_W, _W, _W, _W, /* 156-159 */
  106 +_W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */
  107 +_W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */
  108 +_W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */
  109 +_W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */
  110 +_W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */
  111 +_W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */
  112 +_W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */
  113 +_W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */
  114 +_W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */
  115 +_W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */
  116 +_W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */
  117 +_W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */
  118 +_W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */
  119 +_W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */
  120 +_W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */
  121 +_W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */
  122 +_W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */
  123 +_W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */
  124 +_W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */
  125 +_W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */
  126 +_W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */
  127 +_W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */
  128 +_W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */
  129 +_W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */
  130 +
  131 +static inline int match_token(struct ts_fsm_token *t, u8 d) /* nonzero if octet d is accepted by token t */
  132 +{
  133 + if (t->type) /* class token: type holds the flag mask stored by fsm_init() */
  134 + return (token_lookup_tbl[d] & t->type) != 0; /* any shared flag bit counts as a match */
  135 + else
  136 + return t->value == d; /* type 0 (TS_FSM_SPECIFIC): exact octet comparison */
  137 +}
  138 +
  139 +static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state) /* run the token chain over the text; returns the match start or UINT_MAX */
  140 +{
  141 + struct ts_fsm *fsm = ts_config_priv(conf);
  142 + struct ts_fsm_token *cur = NULL, *next; /* cur: token being matched; next: its successor, NULL for the last token */
  143 + unsigned int match_start, block_idx = 0, tok_idx; /* block_idx: read position inside the current block */
  144 + unsigned block_len = 0, strict, consumed = state->offset; /* consumed: absolute offset of the current block's first octet */
  145 + const u8 *data;
  146 + /* GET_NEXT_BLOCK(): account for the octets read so far, then fetch the next block; evaluates to its length */
  147 +#define GET_NEXT_BLOCK() \
  148 +({ consumed += block_idx; \
  149 + block_idx = 0; \
  150 + block_len = conf->get_next_block(consumed, &data, conf, state); })
  151 + /* TOKEN_MISMATCH(): strict mode gives up; otherwise skip one octet and retry from the first token */
  152 +#define TOKEN_MISMATCH() \
  153 + do { \
  154 + if (strict) \
  155 + goto no_match; \
  156 + block_idx++; \
  157 + goto startover; \
  158 + } while(0)
  159 + /* end_of_data(): true once the current block is used up and get_next_block() yields 0 */
  160 +#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
  161 +
  162 + if (end_of_data())
  163 + goto no_match;
  164 +
  165 + strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE; /* strict unless the chain starts with TS_FSM_HEAD_IGNORE */
  166 +
  167 +startover:
  168 + match_start = consumed + block_idx; /* absolute offset at which this attempt begins */
  169 +
  170 + for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
  171 + cur = &fsm->tokens[tok_idx];
  172 +
  173 + if (likely(tok_idx < (fsm->ntokens - 1)))
  174 + next = &fsm->tokens[tok_idx + 1];
  175 + else
  176 + next = NULL;
  177 +
  178 + switch (cur->recur) {
  179 + case TS_FSM_SINGLE: /* exactly one matching octet is required */
  180 + if (end_of_data())
  181 + goto no_match;
  182 +
  183 + if (!match_token(cur, data[block_idx]))
  184 + TOKEN_MISMATCH();
  185 + break;
  186 +
  187 + case TS_FSM_PERHAPS: /* zero or one: on no data or no match simply move on to the next token */
  188 + if (end_of_data() ||
  189 + !match_token(cur, data[block_idx]))
  190 + continue; /* skips the block_idx++ after the switch, so nothing is consumed */
  191 + break;
  192 +
  193 + case TS_FSM_MULTI: /* one mandatory match, then handled like TS_FSM_ANY */
  194 + if (end_of_data())
  195 + goto no_match;
  196 +
  197 + if (!match_token(cur, data[block_idx]))
  198 + TOKEN_MISMATCH();
  199 +
  200 + block_idx++;
  201 + /* fall through */
  202 +
  203 + case TS_FSM_ANY: /* consume octets matching cur until one matches the next token */
  204 + if (next == NULL) /* last token: no successor to look ahead for, report a match right away */
  205 + goto found_match;
  206 +
  207 + if (end_of_data())
  208 + continue;
  209 +
  210 + while (!match_token(next, data[block_idx])) {
  211 + if (!match_token(cur, data[block_idx]))
  212 + TOKEN_MISMATCH();
  213 + block_idx++;
  214 + if (end_of_data())
  215 + goto no_match;
  216 + }
  217 + continue; /* the octet that matched next is left for the next token to consume */
  218 +
  219 + /*
  220 + * Optimization: Prefer small local loop over jumping
  221 + * back and forth until garbage at head is munched.
  222 + */
  223 + case TS_FSM_HEAD_IGNORE:
  224 + if (end_of_data())
  225 + continue;
  226 +
  227 + while (!match_token(next, data[block_idx])) {
  228 + /*
  229 + * Special case, don't start over upon
  230 + * a mismatch, give the user the
  231 + * chance to specify the type of data
  232 + * allowed to be ignored.
  233 + */
  234 + if (!match_token(cur, data[block_idx]))
  235 + goto no_match;
  236 +
  237 + block_idx++;
  238 + if (end_of_data())
  239 + goto no_match;
  240 + }
  241 +
  242 + match_start = consumed + block_idx; /* ignored head octets are not part of the reported match */
  243 + continue;
  244 + }
  245 +
  246 + block_idx++; /* reached via break: the token consumed one octet */
  247 + }
  248 + /* NOTE(review): after the last token a match is only reported when no data is left; otherwise control falls into no_match -- confirm this is intended */
  249 + if (end_of_data())
  250 + goto found_match;
  251 +
  252 +no_match:
  253 + return UINT_MAX;
  254 +
  255 +found_match:
  256 + state->offset = consumed + block_idx; /* absolute position at which scanning stopped */
  257 + return match_start;
  258 +}
  259 +
  260 +static struct ts_config *fsm_init(const void *pattern, unsigned int len,
  261 + int gfp_mask) /* pattern: array of struct ts_fsm_token; len: its size in bytes */
  262 +{ /* returns the new config, or an ERR_PTR() value on failure */
  263 + int i, err = -EINVAL;
  264 + struct ts_config *conf;
  265 + struct ts_fsm *fsm;
  266 + struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern;
  267 + unsigned int ntokens = len / sizeof(*tokens);
  268 + size_t priv_size = sizeof(*fsm) + len; /* header plus a private copy of the token array */
  269 + /* len must be a whole, non-zero number of tokens */
  270 + if (len % sizeof(struct ts_fsm_token) || ntokens < 1)
  271 + goto errout;
  272 + /* validate every token before allocating anything */
  273 + for (i = 0; i < ntokens; i++) {
  274 + struct ts_fsm_token *t = &tokens[i];
  275 +
  276 + if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
  277 + goto errout;
  278 + /* TS_FSM_HEAD_IGNORE is only accepted as the first token and must have a successor */
  279 + if (t->recur == TS_FSM_HEAD_IGNORE &&
  280 + (i != 0 || i == (ntokens - 1)))
  281 + goto errout;
  282 + }
  283 +
  284 + conf = alloc_ts_config(priv_size, gfp_mask);
  285 + if (IS_ERR(conf))
  286 + return conf; /* hand the allocation error pointer back to the caller */
  287 +
  288 + fsm = ts_config_priv(conf);
  289 + fsm->ntokens = ntokens;
  290 + memcpy(fsm->tokens, pattern, len);
  291 + /* replace each TS_FSM_* type in the private copy by its token_map[] flag mask, as match_token() expects */
  292 + for (i = 0; i < fsm->ntokens; i++) {
  293 + struct ts_fsm_token *t = &fsm->tokens[i];
  294 + t->type = token_map[t->type];
  295 + }
  296 +
  297 + return conf;
  298 +
  299 +errout:
  300 + return ERR_PTR(err); /* err is still -EINVAL on every path that gets here */
  301 +}
  302 +
  303 +static void *fsm_get_pattern(struct ts_config *conf) /* returns the token array stored in the config */
  304 +{
  305 + struct ts_fsm *fsm = ts_config_priv(conf);
  306 + return fsm->tokens; /* NOTE: type fields hold the token_map[] masks written by fsm_init(), not the original TS_FSM_* values */
  307 +}
  308 +
  309 +static unsigned int fsm_get_pattern_len(struct ts_config *conf) /* size of the stored token array in bytes */
  310 +{
  311 + struct ts_fsm *fsm = ts_config_priv(conf);
  312 + return fsm->ntokens * sizeof(struct ts_fsm_token);
  313 +}
  314 +
  315 +static struct ts_ops fsm_ops = { /* operations table passed to textsearch_register() in init_fsm() */
  316 + .name = "fsm",
  317 + .find = fsm_find,
  318 + .init = fsm_init,
  319 + .get_pattern = fsm_get_pattern,
  320 + .get_pattern_len = fsm_get_pattern_len,
  321 + .owner = THIS_MODULE,
  322 + .list = LIST_HEAD_INIT(fsm_ops.list)
  323 +};
  324 +
  325 +static int __init init_fsm(void) /* module entry point: registers fsm_ops */
  326 +{
  327 + return textsearch_register(&fsm_ops); /* the registration result is returned unchanged */
  328 +}
  329 +
  330 +static void __exit exit_fsm(void) /* module exit point: unregisters fsm_ops */
  331 +{
  332 + textsearch_unregister(&fsm_ops);
  333 +}
  334 +
  335 +MODULE_LICENSE("GPL");
  336 +
  337 +module_init(init_fsm);
  338 +module_exit(exit_fsm);