Blame view

lib/ts_bm.c 5.28 KB
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
  /*
   * lib/ts_bm.c		Boyer-Moore text search implementation
   *
   *		This program is free software; you can redistribute it and/or
   *		modify it under the terms of the GNU General Public License
   *		as published by the Free Software Foundation; either version
   *		2 of the License, or (at your option) any later version.
   *
   * Authors:	Pablo Neira Ayuso <pablo@eurodev.net>
   *
   * ==========================================================================
   * 
   *   Implements Boyer-Moore string matching algorithm:
   *
   *   [1] A Fast String Searching Algorithm, R.S. Boyer and Moore.
   *       Communications of the Association for Computing Machinery, 
   *       20(10), 1977, pp. 762-772.
   *       http://www.cs.utexas.edu/users/moore/publications/fstrpos.pdf
   *
   *   [2] Handbook of Exact String Matching Algorithms, Thierry Lecroq, 2004
   *       http://www-igm.univ-mlv.fr/~lecroq/string/string.pdf
   *
   *   Note: Since Boyer-Moore (BM) performs searches for matchings from right 
   *   to left, it's still possible that a matching could be spread over 
   *   multiple blocks, in that case this algorithm won't find any coincidence.
   *   
   *   If you're willing to ensure that such thing won't ever happen, use the
   *   Knuth-Pratt-Morris (KMP) implementation instead. In conclusion, choose 
   *   the proper string search algorithm depending on your setting. 
   *
   *   Say you're using the textsearch infrastructure for filtering, NIDS or 
   *   any similar security focused purpose, then go KMP. Otherwise, if you 
   *   really care about performance, say you're classifying packets to apply
   *   Quality of Service (QoS) policies, and you don't mind about possible
   *   matchings spread over multiple fragments, then go BM.
   */
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
37
38
39
40
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/types.h>
  #include <linux/string.h>
3b76d0819   Joonwoo Park   textsearch: ts_bm...
41
  #include <linux/ctype.h>
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
  #include <linux/textsearch.h>
  
  /* Alphabet size, use ASCII */
  #define ASIZE 256
  
  #if 0
  #define DEBUGP printk
  #else
  #define DEBUGP(args, format...)
  #endif
  
  struct ts_bm
  {
  	u8 *		pattern;
  	unsigned int	patlen;
  	unsigned int 	bad_shift[ASIZE];
  	unsigned int	good_shift[0];
  };
  
  static unsigned int bm_find(struct ts_config *conf, struct ts_state *state)
  {
  	struct ts_bm *bm = ts_config_priv(conf);
  	unsigned int i, text_len, consumed = state->offset;
  	const u8 *text;
aebb6a849   Joonwoo Park   textsearch: fix B...
66
  	int shift = bm->patlen - 1, bs;
3b76d0819   Joonwoo Park   textsearch: ts_bm...
67
  	const u8 icase = conf->flags & TS_IGNORECASE;
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
68
69
70
71
72
73
74
75
76
77
78
79
  
  	for (;;) {
  		text_len = conf->get_next_block(consumed, &text, conf, state);
  
  		if (unlikely(text_len == 0))
  			break;
  
  		while (shift < text_len) {
  			DEBUGP("Searching in position %d (%c)
  ", 
  				shift, text[shift]);
  			for (i = 0; i < bm->patlen; i++) 
3b76d0819   Joonwoo Park   textsearch: ts_bm...
80
81
82
  				if ((icase ? toupper(text[shift-i])
  				    : text[shift-i])
  					!= bm->pattern[bm->patlen-1-i])
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
  				     goto next;
  
  			/* London calling... */
  			DEBUGP("found!
  ");
  			return consumed += (shift-(bm->patlen-1));
  
  next:			bs = bm->bad_shift[text[shift-i]];
  
  			/* Now jumping to... */
  			shift = max_t(int, shift-i+bs, shift+bm->good_shift[i]);
  		}
  		consumed += text_len;
  	}
  
  	return UINT_MAX;
  }
3f330317a   Pablo Neira Ayuso   [TEXTSEARCH]: Fix...
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
  static int subpattern(u8 *pattern, int i, int j, int g)
  {
  	int x = i+g-1, y = j+g-1, ret = 0;
  
  	while(pattern[x--] == pattern[y--]) {
  		if (y < 0) {
  			ret = 1;
  			break;
  		}
  		if (--g == 0) {
  			ret = pattern[i-1] != pattern[j-1];
  			break;
  		}
  	}
  
  	return ret;
  }
3b76d0819   Joonwoo Park   textsearch: ts_bm...
117
  static void compute_prefix_tbl(struct ts_bm *bm, int flags)
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
118
  {
3f330317a   Pablo Neira Ayuso   [TEXTSEARCH]: Fix...
119
  	int i, j, g;
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
120
121
  
  	for (i = 0; i < ASIZE; i++)
3ffaa8c7c   Michael Rash   [TEXTSEARCH]: Fix...
122
  		bm->bad_shift[i] = bm->patlen;
3b76d0819   Joonwoo Park   textsearch: ts_bm...
123
  	for (i = 0; i < bm->patlen - 1; i++) {
3ffaa8c7c   Michael Rash   [TEXTSEARCH]: Fix...
124
  		bm->bad_shift[bm->pattern[i]] = bm->patlen - 1 - i;
3b76d0819   Joonwoo Park   textsearch: ts_bm...
125
126
127
128
  		if (flags & TS_IGNORECASE)
  			bm->bad_shift[tolower(bm->pattern[i])]
  			    = bm->patlen - 1 - i;
  	}
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
129
130
131
  
  	/* Compute the good shift array, used to match reocurrences 
  	 * of a subpattern */
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
132
133
134
  	bm->good_shift[0] = 1;
  	for (i = 1; i < bm->patlen; i++)
  		bm->good_shift[i] = bm->patlen;
3f330317a   Pablo Neira Ayuso   [TEXTSEARCH]: Fix...
135
136
137
138
139
140
          for (i = bm->patlen-1, g = 1; i > 0; g++, i--) {
  		for (j = i-1; j >= 1-g ; j--)
  			if (subpattern(bm->pattern, i, j, g)) {
  				bm->good_shift[g] = bm->patlen-j-g;
  				break;
  			}
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
141
142
143
144
  	}
  }
  
  static struct ts_config *bm_init(const void *pattern, unsigned int len,
3b76d0819   Joonwoo Park   textsearch: ts_bm...
145
  				 gfp_t gfp_mask, int flags)
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
146
147
148
  {
  	struct ts_config *conf;
  	struct ts_bm *bm;
3b76d0819   Joonwoo Park   textsearch: ts_bm...
149
  	int i;
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
150
151
152
153
154
155
  	unsigned int prefix_tbl_len = len * sizeof(unsigned int);
  	size_t priv_size = sizeof(*bm) + len + prefix_tbl_len;
  
  	conf = alloc_ts_config(priv_size, gfp_mask);
  	if (IS_ERR(conf))
  		return conf;
3b76d0819   Joonwoo Park   textsearch: ts_bm...
156
  	conf->flags = flags;
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
157
158
159
  	bm = ts_config_priv(conf);
  	bm->patlen = len;
  	bm->pattern = (u8 *) bm->good_shift + prefix_tbl_len;
3b76d0819   Joonwoo Park   textsearch: ts_bm...
160
161
162
163
164
165
  	if (flags & TS_IGNORECASE)
  		for (i = 0; i < len; i++)
  			bm->pattern[i] = toupper(((u8 *)pattern)[i]);
  	else
  		memcpy(bm->pattern, pattern, len);
  	compute_prefix_tbl(bm, flags);
8082e4ed0   Pablo Neira Ayuso   [LIB]: Boyer-Moor...
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
  
  	return conf;
  }
  
  static void *bm_get_pattern(struct ts_config *conf)
  {
  	struct ts_bm *bm = ts_config_priv(conf);
  	return bm->pattern;
  }
  
  static unsigned int bm_get_pattern_len(struct ts_config *conf)
  {
  	struct ts_bm *bm = ts_config_priv(conf);
  	return bm->patlen;
  }
  
  static struct ts_ops bm_ops = {
  	.name		  = "bm",
  	.find		  = bm_find,
  	.init		  = bm_init,
  	.get_pattern	  = bm_get_pattern,
  	.get_pattern_len  = bm_get_pattern_len,
  	.owner		  = THIS_MODULE,
  	.list		  = LIST_HEAD_INIT(bm_ops.list)
  };
  
  static int __init init_bm(void)
  {
  	return textsearch_register(&bm_ops);
  }
  
  static void __exit exit_bm(void)
  {
  	textsearch_unregister(&bm_ops);
  }
  
  MODULE_LICENSE("GPL");
  
  module_init(init_bm);
  module_exit(exit_bm);