[LIB]: Naive finite state machine based textsearch

A finite state machine consists of n states (struct ts_fsm_token) representing the pattern as a finite automaton. The data is read sequentially on an octet basis. Every state token specifies the number of recurrences and the type of value accepted, which can be either a specific character or a ctype-based set of characters. The available types of recurrence are 1, (0|1), [0 n], and [1 n]. The algorithm distinguishes between strict and non-strict mode, specifying whether the pattern has to start at the first octet. Strict mode is enabled by default and can be disabled by inserting TS_FSM_HEAD_IGNORE as the first token in the chain. The runtime performance of the algorithm should be around O(n); in strict mode, however, the average runtime can be better. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>

[LIB]: Naive finite state machine based textsearch
A finite state machine consists of n states (struct ts_fsm_token) representing the pattern as a finite automaton. The data is read sequentially on an octet basis. Every state token specifies the number of recurrences and the type of value accepted, which can be either a specific character or a ctype-based set of characters. The available types of recurrence are 1, (0|1), [0 n], and [1 n]. The algorithm distinguishes between strict and non-strict mode, specifying whether the pattern has to start at the first octet. Strict mode is enabled by default and can be disabled by inserting TS_FSM_HEAD_IGNORE as the first token in the chain. The runtime performance of the algorithm should be around O(n); in strict mode, however, the average runtime can be better. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
Thomas Graf · David S. Miller
1 parent df3fb93ad9
Showing 4 changed files with 398 additions and 0 deletions Side-by-side Diff
include/linux/textsearch_fsm.h
lib/Kconfig
lib/Makefile
lib/ts_fsm.c
+#ifndef __LINUX_TEXTSEARCH_FSM_H
+#define __LINUX_TEXTSEARCH_FSM_H
+
+#include <linux/types.h>
+
+enum {				/* values for ts_fsm_token.type */
+	TS_FSM_SPECIFIC,	/* specific character (ts_fsm_token.value) */
+	TS_FSM_WILDCARD,	/* any character */
+	TS_FSM_DIGIT,		/* isdigit() */
+	TS_FSM_XDIGIT,		/* isxdigit() */
+	TS_FSM_PRINT,		/* isprint() */
+	TS_FSM_ALPHA,		/* isalpha() */
+	TS_FSM_ALNUM,		/* isalnum() */
+	TS_FSM_ASCII,		/* isascii() */
+	TS_FSM_CNTRL,		/* iscntrl() */
+	TS_FSM_GRAPH,		/* isgraph() */
+	TS_FSM_LOWER,		/* islower() */
+	TS_FSM_UPPER,		/* isupper() */
+	TS_FSM_PUNCT,		/* ispunct() */
+	TS_FSM_SPACE,		/* isspace() */
+	__TS_FSM_TYPE_MAX,	/* sentinel: one past the last valid type */
+};
+#define TS_FSM_TYPE_MAX (__TS_FSM_TYPE_MAX - 1)	/* highest valid type */
+
+enum {				/* values for ts_fsm_token.recur */
+	TS_FSM_SINGLE,		/* 1 occurrence */
+	TS_FSM_PERHAPS,		/* 1 or 0 occurrence */
+	TS_FSM_ANY,		/* 0..n occurrences */
+	TS_FSM_MULTI,		/* 1..n occurrences */
+	TS_FSM_HEAD_IGNORE,	/* 0..n ignored occurrences at head; first token only */
+	__TS_FSM_RECUR_MAX,	/* sentinel: one past the last valid recurrence */
+};
+#define TS_FSM_RECUR_MAX (__TS_FSM_RECUR_MAX - 1)	/* highest valid recurrence */
+
+/**
+ * struct ts_fsm_token - state machine token (state)
+ * @type: type of token, one of TS_FSM_SPECIFIC .. TS_FSM_SPACE
+ * @recur: number of recurrences, one of TS_FSM_SINGLE .. TS_FSM_HEAD_IGNORE
+ * @value: character value for TS_FSM_SPECIFIC
+ *
+ * A search pattern is an array of these tokens; its length in bytes must
+ * be a multiple of sizeof(struct ts_fsm_token).
+ */
+struct ts_fsm_token
+{
+	__u16		type;
+	__u8		recur;
+	__u8		value;
+};
+
+#endif
@@ -80,5 +80,16 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called ts_kmp.
  
+config TEXTSEARCH_FSM
+	depends on TEXTSEARCH
+	tristate "Finite state machine"
+	help
+	  Say Y here if you want to be able to search text using a
+	  naive finite state machine approach implementing a subset
+	  of regular expressions.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called ts_fsm.
+
 endmenu
@@ -38,6 +38,7 @@
  
 lib-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
+obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
  
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
+/*
+ * lib/ts_fsm.c	   A naive finite state machine text search approach
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ *   A finite state machine consists of n states (struct ts_fsm_token)
+ *   representing the pattern as a finite automaton. The data is read
+ *   sequentially on an octet basis. Every state token specifies the number
+ *   of recurrences and the type of value accepted which can be either a
+ *   specific character or ctype based set of characters. The available
+ *   types of recurrence include 1, (0|1), [0 n], and [1 n].
+ *
+ *   The algorithm differs between strict/non-strict mode specifying
+ *   whether the pattern has to start at the first octet. Strict mode
+ *   is enabled by default and can be disabled by inserting
+ *   TS_FSM_HEAD_IGNORE as the first token in the chain.
+ *
+ *   The runtime performance of the algorithm should be around O(n),
+ *   however while in strict mode the average runtime can be better.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/textsearch.h>
+#include <linux/textsearch_fsm.h>
+
+/*
+ * Private data of an "fsm" search configuration, filled in by fsm_init().
+ * @ntokens: number of entries in @tokens
+ * @tokens: the token chain; fsm_init() replaces each type by its
+ *          token_map[] flag mask
+ */
+struct ts_fsm
+{
+	unsigned int		ntokens;
+	struct ts_fsm_token	tokens[];	/* flexible array member */
+};
+
+/* extra flag bits, OR-ed with the ctype.h flags in token_lookup_tbl[] */
+#define _A		0x100 /* ascii */
+#define _W		0x200 /* wildcard */
+
+/*
+ * Map a TS_FSM_* token type to the flag mask that match_token() tests
+ * against token_lookup_tbl[]. TS_FSM_SPECIFIC maps to 0, which
+ * match_token() takes to mean "compare against the literal value".
+ * The table is only ever read, hence const.
+ */
+static const u16 token_map[TS_FSM_TYPE_MAX+1] = {
+	[TS_FSM_SPECIFIC] = 0,
+	[TS_FSM_WILDCARD] = _W,
+	[TS_FSM_CNTRL]	  = _C,
+	[TS_FSM_LOWER]	  = _L,
+	[TS_FSM_UPPER]	  = _U,
+	[TS_FSM_PUNCT]	  = _P,
+	[TS_FSM_SPACE]	  = _S,
+	[TS_FSM_DIGIT]	  = _D,
+	[TS_FSM_XDIGIT]	  = _D | _X,
+	[TS_FSM_ALPHA]	  = _U | _L,
+	[TS_FSM_ALNUM]	  = _U | _L | _D,
+	[TS_FSM_PRINT]	  = _P | _U | _L | _D | _SP,
+	[TS_FSM_GRAPH]	  = _P | _U | _L | _D,
+	[TS_FSM_ASCII]	  = _A,
+};
+
+/*
+ * Flag word for each of the 256 octet values, indexed by the octet:
+ * its character class bits, plus _W for every octet and _A for the
+ * octets 0-127. The table is only ever read, hence const.
+ */
+static const u16 token_lookup_tbl[256] = {
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   0-  3 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   4-  7 */
+_W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,		/*   8- 11 */
+_W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,		/*  12- 15 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  16- 19 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  20- 23 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  24- 27 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  28- 31 */
+_W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  32- 35 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  36- 39 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  40- 43 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  44- 47 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  48- 51 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  52- 55 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,		/*  56- 59 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  60- 63 */
+_W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,		/*  64- 67 */
+_W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,		/*  68- 71 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  72- 75 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  76- 79 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  80- 83 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  84- 87 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,		/*  88- 91 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  92- 95 */
+_W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,		/*  96- 99 */
+_W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,		/* 100-103 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 104-107 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 108-111 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 112-115 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 116-119 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,		/* 120-123 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,		/* 124-127 */
+_W,            _W,           _W,           _W,			/* 128-131 */
+_W,            _W,           _W,           _W,			/* 132-135 */
+_W,            _W,           _W,           _W,			/* 136-139 */
+_W,            _W,           _W,           _W,			/* 140-143 */
+_W,            _W,           _W,           _W,			/* 144-147 */
+_W,            _W,           _W,           _W,			/* 148-151 */
+_W,            _W,           _W,           _W,			/* 152-155 */
+_W,            _W,           _W,           _W,			/* 156-159 */
+_W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,		/* 160-163 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 164-167 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 168-171 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 172-175 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 176-179 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 180-183 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 184-187 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 188-191 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 192-195 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 196-199 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 200-203 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 204-207 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 208-211 */
+_W|_U,         _W|_U,        _W|_U,        _W|_P,		/* 212-215 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 216-219 */
+_W|_U,         _W|_U,        _W|_U,        _W|_L,		/* 220-223 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 224-227 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 228-231 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 232-235 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 236-239 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 240-243 */
+_W|_L,         _W|_L,        _W|_L,        _W|_P,		/* 244-247 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 248-251 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L};		/* 252-255 */
+
+/**
+ * match_token - test one octet against a state token
+ * @t: token whose type already holds the flag mask stored by fsm_init()
+ *     (0 for a literal character)
+ * @d: octet to test
+ *
+ * Returns 1 if @d is accepted by @t and 0 otherwise.
+ */
+static inline int match_token(struct ts_fsm_token *t, u8 d)
+{
+	/* a zero mask marks a literal: compare the octet itself */
+	if (!t->type)
+		return d == t->value;
+
+	/* otherwise the octet's flag word must share a bit with the mask */
+	return !!(token_lookup_tbl[d] & t->type);
+}
+/*
+ * fsm_find - run the token chain of @conf over the text to be searched
+ * @conf: search configuration; its private area holds the struct ts_fsm
+ * @state: search state; state->offset is the absolute offset to start at
+ *
+ * The text is pulled in blocks through conf->get_next_block(). "consumed"
+ * is the start offset plus the length of every block already finished and
+ * block_idx is the position inside the current block, so
+ * consumed + block_idx is the absolute offset of the octet being examined.
+ *
+ * Returns the absolute offset at which the match starts and stores the
+ * offset following the last consumed octet in state->offset. Returns
+ * UINT_MAX if there is no match.
+ */
+static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
+{
+	struct ts_fsm *fsm = ts_config_priv(conf);
+	struct ts_fsm_token *cur = NULL, *next;
+	unsigned int match_start, block_idx = 0, tok_idx;
+	unsigned block_len = 0, strict, consumed = state->offset;
+	const u8 *data;		/* current block, set by GET_NEXT_BLOCK() */
+/* Account for the finished block and fetch the next one; evaluates to its length. */
+#define GET_NEXT_BLOCK()		\
+({	consumed += block_idx;		\
+	block_idx = 0;			\
+	block_len = conf->get_next_block(consumed, &data, conf, state); })
+/* Strict mode gives up on a mismatch; otherwise skip one octet and restart at token 0. */
+#define TOKEN_MISMATCH()		\
+	do {				\
+		if (strict)		\
+			goto no_match;	\
+		block_idx++;		\
+		goto startover;		\
+	} while(0)
+/* True only if the current block is used up and fetching another one yields length 0. */
+#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
+
+	if (end_of_data())	/* block_len starts at 0, so this fetches the first block */
+		goto no_match;
+
+	strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;
+
+startover:
+	match_start = consumed + block_idx;	/* candidate start of the match */
+
+	for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
+		cur = &fsm->tokens[tok_idx];
+
+		if (likely(tok_idx < (fsm->ntokens - 1)))
+			next = &fsm->tokens[tok_idx + 1];
+		else
+			next = NULL;	/* cur is the last token */
+
+		switch (cur->recur) {
+		case TS_FSM_SINGLE:	/* exactly one octet must match */
+			if (end_of_data())
+				goto no_match;
+
+			if (!match_token(cur, data[block_idx]))
+				TOKEN_MISMATCH();
+			break;
+
+		case TS_FSM_PERHAPS:
+			if (end_of_data() ||
+			    !match_token(cur, data[block_idx]))
+				continue;	/* zero occurrences: octet is left for the next token */
+			break;
+
+		case TS_FSM_MULTI:
+			if (end_of_data())
+				goto no_match;
+
+			if (!match_token(cur, data[block_idx]))
+				TOKEN_MISMATCH();
+
+			block_idx++;	/* the mandatory first occurrence */
+			/* fall through */
+
+		case TS_FSM_ANY:
+			if (next == NULL)	/* last token: report the match right away */
+				goto found_match;
+
+			if (end_of_data())
+				continue;
+
+			/* consume octets accepted by cur until one is accepted by next */
+			while (!match_token(next, data[block_idx])) {
+				if (!match_token(cur, data[block_idx]))
+					TOKEN_MISMATCH();
+				block_idx++;
+				if (end_of_data())
+					goto no_match;
+			}
+			continue;	/* the current octet is left for next */
+
+		/*
+		 * Optimization: Prefer small local loop over jumping
+		 * back and forth until garbage at head is munched.
+		 */
+		case TS_FSM_HEAD_IGNORE:
+			if (end_of_data())
+				continue;
+
+			while (!match_token(next, data[block_idx])) {
+				/*
+				 * Special case, don't start over upon
+				 * a mismatch, give the user the
+				 * chance to specify the type of data
+				 * allowed to be ignored.
+				 */
+				if (!match_token(cur, data[block_idx]))
+					goto no_match;
+
+				block_idx++;
+				if (end_of_data())
+					goto no_match;
+			}
+
+			match_start = consumed + block_idx;	/* match begins after the ignored head */
+			continue;
+		}
+
+		block_idx++;	/* reached via break only: consume the octet SINGLE/PERHAPS matched */
+	}
+	/*
+	 * Every token has been processed. A match is reported only if no
+	 * data is left; otherwise control falls through to no_match.
+	 * NOTE(review): confirm that rejecting trailing data is intended.
+	 */
+	if (end_of_data())
+		goto found_match;
+
+no_match:
+	return UINT_MAX;
+
+found_match:
+	state->offset = consumed + block_idx;
+	return match_start;
+}
+
+/**
+ * fsm_init - validate a token chain and build a search configuration
+ * @pattern: array of struct ts_fsm_token describing the state machine
+ * @len: size of @pattern in bytes
+ * @gfp_mask: allocation flags handed to alloc_ts_config()
+ *
+ * @len must be a non-zero multiple of sizeof(struct ts_fsm_token), every
+ * token must carry a valid type and recurrence, and TS_FSM_HEAD_IGNORE is
+ * only accepted on the first token of a chain of at least two tokens.
+ *
+ * The tokens are copied into the private area of the configuration and
+ * the type of each copy is replaced by its token_map[] flag mask, which
+ * is the form match_token() expects. @pattern itself is not modified.
+ *
+ * Returns the new configuration, ERR_PTR(-EINVAL) for a malformed
+ * pattern, or the error pointer returned by alloc_ts_config().
+ */
+static struct ts_config *fsm_init(const void *pattern, unsigned int len,
+				     int gfp_mask)
+{
+	const struct ts_fsm_token *tokens = pattern;
+	unsigned int i, ntokens = len / sizeof(*tokens);
+	size_t priv_size = sizeof(struct ts_fsm) + len;
+	struct ts_config *conf;
+	struct ts_fsm *fsm;
+
+	if (len % sizeof(*tokens) || ntokens < 1)
+		return ERR_PTR(-EINVAL);
+
+	for (i = 0; i < ntokens; i++) {
+		const struct ts_fsm_token *t = &tokens[i];
+
+		if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
+			return ERR_PTR(-EINVAL);
+
+		/* fsm_find() looks at the successor of a HEAD_IGNORE token */
+		if (t->recur == TS_FSM_HEAD_IGNORE &&
+		    (i != 0 || i == (ntokens - 1)))
+			return ERR_PTR(-EINVAL);
+	}
+
+	conf = alloc_ts_config(priv_size, gfp_mask);
+	if (IS_ERR(conf))
+		return conf;
+
+	fsm = ts_config_priv(conf);
+	fsm->ntokens = ntokens;
+	memcpy(fsm->tokens, pattern, len);
+
+	/* translate the public TS_FSM_* types into lookup table masks */
+	for (i = 0; i < fsm->ntokens; i++) {
+		struct ts_fsm_token *t = &fsm->tokens[i];
+		t->type = token_map[t->type];
+	}
+
+	return conf;
+}
+
+/**
+ * fsm_get_pattern - return the token array stored in a configuration
+ * @conf: search configuration whose private area is a struct ts_fsm
+ *
+ * The array is the copy made by fsm_init(), i.e. the type of every token
+ * holds the token_map[] flag mask rather than the TS_FSM_* type.
+ */
+static void *fsm_get_pattern(struct ts_config *conf)
+{
+	struct ts_fsm *priv = ts_config_priv(conf);
+
+	return &priv->tokens[0];
+}
+
+/**
+ * fsm_get_pattern_len - size in bytes of the stored token array
+ * @conf: search configuration whose private area is a struct ts_fsm
+ */
+static unsigned int fsm_get_pattern_len(struct ts_config *conf)
+{
+	struct ts_fsm *priv = ts_config_priv(conf);
+
+	return sizeof(priv->tokens[0]) * priv->ntokens;
+}
+
+static struct ts_ops fsm_ops = {	/* handed to textsearch_register() by init_fsm() */
+	.name		  = "fsm",
+	.find		  = fsm_find,		/* run the state machine over the text */
+	.init		  = fsm_init,		/* validate and copy a token array */
+	.get_pattern	  = fsm_get_pattern,	/* stored token array */
+	.get_pattern_len  = fsm_get_pattern_len,	/* its size in bytes */
+	.owner		  = THIS_MODULE,
+	.list		  = LIST_HEAD_INIT(fsm_ops.list)
+};
+
+/* Module init: hand fsm_ops to textsearch_register() and pass on its result. */
+static int __init init_fsm(void)
+{
+	int rc = textsearch_register(&fsm_ops);
+
+	return rc;
+}
+
+/* Module exit: withdraw fsm_ops again via textsearch_unregister(). */
+static void __exit exit_fsm(void)
+{
+	textsearch_unregister(&fsm_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_fsm);
+module_exit(exit_fsm);
	1	+#ifndef __LINUX_TEXTSEARCH_FSM_H
	2	+#define __LINUX_TEXTSEARCH_FSM_H
	3	+
	4	+#include <linux/types.h>
	5	+
	6	+enum {
	7	+ TS_FSM_SPECIFIC, /* specific character */
	8	+ TS_FSM_WILDCARD, /* any character */
	9	+ TS_FSM_DIGIT, /* isdigit() */
	10	+ TS_FSM_XDIGIT, /* isxdigit() */
	11	+ TS_FSM_PRINT, /* isprint() */
	12	+ TS_FSM_ALPHA, /* isalpha() */
	13	+ TS_FSM_ALNUM, /* isalnum() */
	14	+ TS_FSM_ASCII, /* isascii() */
	15	+ TS_FSM_CNTRL, /* iscntrl() */
	16	+ TS_FSM_GRAPH, /* isgraph() */
	17	+ TS_FSM_LOWER, /* islower() */
	18	+ TS_FSM_UPPER, /* isupper() */
	19	+ TS_FSM_PUNCT, /* ispunct() */
	20	+ TS_FSM_SPACE, /* isspace() */
	21	+ __TS_FSM_TYPE_MAX,
	22	+};
	23	+#define TS_FSM_TYPE_MAX (__TS_FSM_TYPE_MAX - 1)
	24	+
	25	+enum {
	26	+ TS_FSM_SINGLE, /* 1 occurrence */
	27	+ TS_FSM_PERHAPS, /* 1 or 0 occurrence */
	28	+ TS_FSM_ANY, /* 0..n occurrences */
	29	+ TS_FSM_MULTI, /* 1..n occurrences */
	30	+ TS_FSM_HEAD_IGNORE, /* 0..n ignored occurrences at head */
	31	+ __TS_FSM_RECUR_MAX,
	32	+};
	33	+#define TS_FSM_RECUR_MAX (__TS_FSM_RECUR_MAX - 1)
	34	+
	35	+/**
	36	+ * struct ts_fsm_token - state machine token (state)
	37	+ * @type: type of token
	38	+ * @recur: number of recurrences
	39	+ * @value: character value for TS_FSM_SPECIFIC
	40	+ */
	41	+struct ts_fsm_token
	42	+{
	43	+ __u16 type;
	44	+ __u8 recur;
	45	+ __u8 value;
	46	+};
	47	+
	48	+#endif
...	...	@@ -80,5 +80,16 @@
80	80	To compile this code as a module, choose M here: the
81	81	module will be called ts_kmp.
82	82
	83	+config TEXTSEARCH_FSM
	84	+ depends on TEXTSEARCH
	85	+ tristate "Finite state machine"
	86	+ help
	87	+ Say Y here if you want to be able to search text using a
	88	+ naive finite state machine approach implementing a subset
	89	+ of regular expressions.
	90	+
	91	+ To compile this code as a module, choose M here: the
	92	+ module will be called ts_fsm.
	93	+
83	94	endmenu
...	...	@@ -38,6 +38,7 @@
38	38
39	39	lib-$(CONFIG_TEXTSEARCH) += textsearch.o
40	40	obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
	41	+obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
41	42
42	43	hostprogs-y := gen_crc32table
43	44	clean-files := crc32table.h
	1	+/*
	2	+ * lib/ts_fsm.c A naive finite state machine text search approach
	3	+ *
	4	+ * This program is free software; you can redistribute it and/or
	5	+ * modify it under the terms of the GNU General Public License
	6	+ * as published by the Free Software Foundation; either version
	7	+ * 2 of the License, or (at your option) any later version.
	8	+ *
	9	+ * Authors: Thomas Graf <tgraf@suug.ch>
	10	+ *
	11	+ * ==========================================================================
	12	+ *
	13	+ * A finite state machine consists of n states (struct ts_fsm_token)
	14	+ * representing the pattern as a finite automaton. The data is read
	15	+ * sequentially on an octet basis. Every state token specifies the number
	16	+ * of recurrences and the type of value accepted which can be either a
	17	+ * specific character or ctype based set of characters. The available
	18	+ * types of recurrence include 1, (0|1), [0 n], and [1 n].
	19	+ *
	20	+ * The algorithm differs between strict/non-strict mode specifying
	21	+ * whether the pattern has to start at the first octet. Strict mode
	22	+ * is enabled by default and can be disabled by inserting
	23	+ * TS_FSM_HEAD_IGNORE as the first token in the chain.
	24	+ *
	25	+ * The runtime performance of the algorithm should be around O(n),
	26	+ * however while in strict mode the average runtime can be better.
	27	+ */
	28	+
	29	+#include <linux/config.h>
	30	+#include <linux/module.h>
	31	+#include <linux/types.h>
	32	+#include <linux/string.h>
	33	+#include <linux/ctype.h>
	34	+#include <linux/textsearch.h>
	35	+#include <linux/textsearch_fsm.h>
	36	+
	37	+struct ts_fsm
	38	+{
	39	+ unsigned int ntokens;
	40	+ struct ts_fsm_token tokens[0];
	41	+};
	42	+
	43	+/* other values derived from ctype.h */
	44	+#define _A 0x100 /* ascii */
	45	+#define _W 0x200 /* wildcard */
	46	+
	47	+/* Map to _ctype flags and some magic numbers */
	48	+static u16 token_map[TS_FSM_TYPE_MAX+1] = {
	49	+ [TS_FSM_SPECIFIC] = 0,
	50	+ [TS_FSM_WILDCARD] = _W,
	51	+ [TS_FSM_CNTRL] = _C,
	52	+ [TS_FSM_LOWER] = _L,
	53	+ [TS_FSM_UPPER] = _U,
	54	+ [TS_FSM_PUNCT] = _P,
	55	+ [TS_FSM_SPACE] = _S,
	56	+ [TS_FSM_DIGIT] = _D,
	57	+ [TS_FSM_XDIGIT] = _D | _X,
	58	+ [TS_FSM_ALPHA] = _U | _L,
	59	+ [TS_FSM_ALNUM] = _U | _L | _D,
	60	+ [TS_FSM_PRINT] = _P | _U | _L | _D | _SP,
	61	+ [TS_FSM_GRAPH] = _P | _U | _L | _D,
	62	+ [TS_FSM_ASCII] = _A,
	63	+};
	64	+
	65	+static u16 token_lookup_tbl[256] = {
	66	+_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */
	67	+_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */
	68	+_W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */
	69	+_W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */
	70	+_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */
	71	+_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */
	72	+_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */
	73	+_W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */
	74	+_W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */
	75	+_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */
	76	+_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */
	77	+_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */
	78	+_W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */
	79	+_W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */
	80	+_W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */
	81	+_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */
	82	+_W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */
	83	+_W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */
	84	+_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */
	85	+_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */
	86	+_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */
	87	+_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */
	88	+_W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */
	89	+_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */
	90	+_W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */
	91	+_W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */
	92	+_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */
	93	+_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */
	94	+_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */
	95	+_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */
	96	+_W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */
	97	+_W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */
	98	+_W, _W, _W, _W, /* 128-131 */
	99	+_W, _W, _W, _W, /* 132-135 */
	100	+_W, _W, _W, _W, /* 136-139 */
	101	+_W, _W, _W, _W, /* 140-143 */
	102	+_W, _W, _W, _W, /* 144-147 */
	103	+_W, _W, _W, _W, /* 148-151 */
	104	+_W, _W, _W, _W, /* 152-155 */
	105	+_W, _W, _W, _W, /* 156-159 */
	106	+_W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */
	107	+_W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */
	108	+_W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */
	109	+_W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */
	110	+_W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */
	111	+_W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */
	112	+_W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */
	113	+_W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */
	114	+_W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */
	115	+_W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */
	116	+_W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */
	117	+_W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */
	118	+_W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */
	119	+_W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */
	120	+_W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */
	121	+_W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */
	122	+_W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */
	123	+_W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */
	124	+_W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */
	125	+_W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */
	126	+_W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */
	127	+_W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */
	128	+_W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */
	129	+_W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */
	130	+
	131	+static inline int match_token(struct ts_fsm_token *t, u8 d)
	132	+{
	133	+ if (t->type)
	134	+ return (token_lookup_tbl[d] & t->type) != 0;
	135	+ else
	136	+ return t->value == d;
	137	+}
	138	+
	139	+static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
	140	+{
	141	+ struct ts_fsm *fsm = ts_config_priv(conf);
	142	+ struct ts_fsm_token *cur = NULL, *next;
	143	+ unsigned int match_start, block_idx = 0, tok_idx;
	144	+ unsigned block_len = 0, strict, consumed = state->offset;
	145	+ const u8 *data;
	146	+
	147	+#define GET_NEXT_BLOCK() \
	148	+({ consumed += block_idx; \
	149	+ block_idx = 0; \
	150	+ block_len = conf->get_next_block(consumed, &data, conf, state); })
	151	+
	152	+#define TOKEN_MISMATCH() \
	153	+ do { \
	154	+ if (strict) \
	155	+ goto no_match; \
	156	+ block_idx++; \
	157	+ goto startover; \
	158	+ } while(0)
	159	+
	160	+#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
	161	+
	162	+ if (end_of_data())
	163	+ goto no_match;
	164	+
	165	+ strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;
	166	+
	167	+startover:
	168	+ match_start = consumed + block_idx;
	169	+
	170	+ for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
	171	+ cur = &fsm->tokens[tok_idx];
	172	+
	173	+ if (likely(tok_idx < (fsm->ntokens - 1)))
	174	+ next = &fsm->tokens[tok_idx + 1];
	175	+ else
	176	+ next = NULL;
	177	+
	178	+ switch (cur->recur) {
	179	+ case TS_FSM_SINGLE:
	180	+ if (end_of_data())
	181	+ goto no_match;
	182	+
	183	+ if (!match_token(cur, data[block_idx]))
	184	+ TOKEN_MISMATCH();
	185	+ break;
	186	+
	187	+ case TS_FSM_PERHAPS:
	188	+ if (end_of_data() ||
	189	+ !match_token(cur, data[block_idx]))
	190	+ continue;
	191	+ break;
	192	+
	193	+ case TS_FSM_MULTI:
	194	+ if (end_of_data())
	195	+ goto no_match;
	196	+
	197	+ if (!match_token(cur, data[block_idx]))
	198	+ TOKEN_MISMATCH();
	199	+
	200	+ block_idx++;
	201	+ /* fall through */
	202	+
	203	+ case TS_FSM_ANY:
	204	+ if (next == NULL)
	205	+ goto found_match;
	206	+
	207	+ if (end_of_data())
	208	+ continue;
	209	+
	210	+ while (!match_token(next, data[block_idx])) {
	211	+ if (!match_token(cur, data[block_idx]))
	212	+ TOKEN_MISMATCH();
	213	+ block_idx++;
	214	+ if (end_of_data())
	215	+ goto no_match;
	216	+ }
	217	+ continue;
	218	+
	219	+ /*
	220	+ * Optimization: Prefer small local loop over jumping
	221	+ * back and forth until garbage at head is munched.
	222	+ */
	223	+ case TS_FSM_HEAD_IGNORE:
	224	+ if (end_of_data())
	225	+ continue;
	226	+
	227	+ while (!match_token(next, data[block_idx])) {
	228	+ /*
	229	+ * Special case, don't start over upon
	230	+ * a mismatch, give the user the
	231	+ * chance to specify the type of data
	232	+ * allowed to be ignored.
	233	+ */
	234	+ if (!match_token(cur, data[block_idx]))
	235	+ goto no_match;
	236	+
	237	+ block_idx++;
	238	+ if (end_of_data())
	239	+ goto no_match;
	240	+ }
	241	+
	242	+ match_start = consumed + block_idx;
	243	+ continue;
	244	+ }
	245	+
	246	+ block_idx++;
	247	+ }
	248	+
	249	+ if (end_of_data())
	250	+ goto found_match;
	251	+
	252	+no_match:
	253	+ return UINT_MAX;
	254	+
	255	+found_match:
	256	+ state->offset = consumed + block_idx;
	257	+ return match_start;
	258	+}
	259	+
	260	+static struct ts_config *fsm_init(const void *pattern, unsigned int len,
	261	+ int gfp_mask)
	262	+{
	263	+ int i, err = -EINVAL;
	264	+ struct ts_config *conf;
	265	+ struct ts_fsm *fsm;
	266	+ struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern;
	267	+ unsigned int ntokens = len / sizeof(*tokens);
	268	+ size_t priv_size = sizeof(*fsm) + len;
	269	+
	270	+ if (len % sizeof(struct ts_fsm_token) || ntokens < 1)
	271	+ goto errout;
	272	+
	273	+ for (i = 0; i < ntokens; i++) {
	274	+ struct ts_fsm_token *t = &tokens[i];
	275	+
	276	+ if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
	277	+ goto errout;
	278	+
	279	+ if (t->recur == TS_FSM_HEAD_IGNORE &&
	280	+ (i != 0 || i == (ntokens - 1)))
	281	+ goto errout;
	282	+ }
	283	+
	284	+ conf = alloc_ts_config(priv_size, gfp_mask);
	285	+ if (IS_ERR(conf))
	286	+ return conf;
	287	+
	288	+ fsm = ts_config_priv(conf);
	289	+ fsm->ntokens = ntokens;
	290	+ memcpy(fsm->tokens, pattern, len);
	291	+
	292	+ for (i = 0; i < fsm->ntokens; i++) {
	293	+ struct ts_fsm_token *t = &fsm->tokens[i];
	294	+ t->type = token_map[t->type];
	295	+ }
	296	+
	297	+ return conf;
	298	+
	299	+errout:
	300	+ return ERR_PTR(err);
	301	+}
	302	+
	303	+static void *fsm_get_pattern(struct ts_config *conf)
	304	+{
	305	+ struct ts_fsm *fsm = ts_config_priv(conf);
	306	+ return fsm->tokens;
	307	+}
	308	+
	309	+static unsigned int fsm_get_pattern_len(struct ts_config *conf)
	310	+{
	311	+ struct ts_fsm *fsm = ts_config_priv(conf);
	312	+ return fsm->ntokens * sizeof(struct ts_fsm_token);
	313	+}
	314	+
	315	+static struct ts_ops fsm_ops = {
	316	+ .name = "fsm",
	317	+ .find = fsm_find,
	318	+ .init = fsm_init,
	319	+ .get_pattern = fsm_get_pattern,
	320	+ .get_pattern_len = fsm_get_pattern_len,
	321	+ .owner = THIS_MODULE,
	322	+ .list = LIST_HEAD_INIT(fsm_ops.list)
	323	+};
	324	+
	325	+static int __init init_fsm(void)
	326	+{
	327	+ return textsearch_register(&fsm_ops);
	328	+}
	329	+
	330	+static void __exit exit_fsm(void)
	331	+{
	332	+ textsearch_unregister(&fsm_ops);
	333	+}
	334	+
	335	+MODULE_LICENSE("GPL");
	336	+
	337	+module_init(init_fsm);
	338	+module_exit(exit_fsm);