[LIB]: Naive finite state machine based textsearch
Thomas Graf [Fri, 24 Jun 2005 03:59:16 +0000 (20:59 -0700)]
A finite state machine consists of n states (struct ts_fsm_token)
representing the pattern as a finite automation. The data is read
sequentially on a octet basis. Every state token specifies the number
of recurrences and the type of value accepted which can be either a
specific character or ctype based set of characters. The available
type of recurrences include 1, (0|1), [0 n], and [1 n].

The algorithm differs between strict/non-strict mode specyfing
whether the pattern has to start at the first octect. Strict mode
is enabled by default and can be disabled by inserting
TS_FSM_HEAD_IGNORE as the first token in the chain.

The runtime performance of the algorithm should be around O(n),
however while in strict mode the average runtime can be better.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>

include/linux/textsearch_fsm.h [new file with mode: 0644]
lib/Kconfig
lib/Makefile
lib/ts_fsm.c [new file with mode: 0644]

diff --git a/include/linux/textsearch_fsm.h b/include/linux/textsearch_fsm.h
new file mode 100644 (file)
index 0000000..fdfa078
--- /dev/null
@@ -0,0 +1,48 @@
+#ifndef __LINUX_TEXTSEARCH_FSM_H
+#define __LINUX_TEXTSEARCH_FSM_H
+
+#include <linux/types.h>
+
+enum {
+       TS_FSM_SPECIFIC,        /* specific character */
+       TS_FSM_WILDCARD,        /* any character */
+       TS_FSM_DIGIT,           /* isdigit() */
+       TS_FSM_XDIGIT,          /* isxdigit() */
+       TS_FSM_PRINT,           /* isprint() */
+       TS_FSM_ALPHA,           /* isalpha() */
+       TS_FSM_ALNUM,           /* isalnum() */
+       TS_FSM_ASCII,           /* isascii() */
+       TS_FSM_CNTRL,           /* iscntrl() */
+       TS_FSM_GRAPH,           /* isgraph() */
+       TS_FSM_LOWER,           /* islower() */
+       TS_FSM_UPPER,           /* isupper() */
+       TS_FSM_PUNCT,           /* ispunct() */
+       TS_FSM_SPACE,           /* isspace() */
+       __TS_FSM_TYPE_MAX,
+};
+#define TS_FSM_TYPE_MAX (__TS_FSM_TYPE_MAX - 1)
+
+enum {
+       TS_FSM_SINGLE,          /* 1 occurrence */
+       TS_FSM_PERHAPS,         /* 1 or 0 occurrence */
+       TS_FSM_ANY,             /* 0..n occurrences */
+       TS_FSM_MULTI,           /* 1..n occurrences */
+       TS_FSM_HEAD_IGNORE,     /* 0..n ignored occurrences at head */
+       __TS_FSM_RECUR_MAX,
+};
+#define TS_FSM_RECUR_MAX (__TS_FSM_RECUR_MAX - 1)
+
+/**
+ * struct ts_fsm_token - state machine token (state)
+ * @type: type of token
+ * @recur: number of recurrences
+ * @value: character value for TS_FSM_SPECIFIC
+ */
+struct ts_fsm_token
+{
+       __u16           type;
+       __u8            recur;
+       __u8            value;
+};
+
+#endif
index 16b8fa2..455833a 100644 (file)
@@ -80,4 +80,15 @@ config TEXTSEARCH_KMP
          To compile this code as a module, choose M here: the
          module will be called ts_kmp.
 
+config TEXTSEARCH_FSM
+       depends on TEXTSEARCH
+       tristate "Finite state machine"
+       help
+         Say Y here if you want to be able to search text using a
+         naive finite state machine approach implementing a subset
+         of regular expressions.
+
+         To compile this code as a module, choose M here: the
+         module will be called ts_fsm.
+
 endmenu
index 6cdb10f..7f6eda4 100644 (file)
@@ -38,6 +38,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
 lib-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
+obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 
 hostprogs-y    := gen_crc32table
 clean-files    := crc32table.h
diff --git a/lib/ts_fsm.c b/lib/ts_fsm.c
new file mode 100644 (file)
index 0000000..d27c0a0
--- /dev/null
@@ -0,0 +1,338 @@
+/*
+ * lib/ts_fsm.c           A naive finite state machine text search approach
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * Authors:    Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ *   A finite state machine consists of n states (struct ts_fsm_token)
+ *   representing the pattern as a finite automation. The data is read
+ *   sequentially on a octet basis. Every state token specifies the number
+ *   of recurrences and the type of value accepted which can be either a
+ *   specific character or ctype based set of characters. The available
+ *   type of recurrences include 1, (0|1), [0 n], and [1 n].
+ *
+ *   The algorithm differs between strict/non-strict mode specyfing
+ *   whether the pattern has to start at the first octect. Strict mode
+ *   is enabled by default and can be disabled by inserting
+ *   TS_FSM_HEAD_IGNORE as the first token in the chain.
+ *
+ *   The runtime performance of the algorithm should be around O(n),
+ *   however while in strict mode the average runtime can be better.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/textsearch.h>
+#include <linux/textsearch_fsm.h>
+
+struct ts_fsm
+{
+       unsigned int            ntokens;
+       struct ts_fsm_token     tokens[0];
+};
+
+/* other values derived from ctype.h */
+#define _A             0x100 /* ascii */
+#define _W             0x200 /* wildcard */
+
+/* Map to _ctype flags and some magic numbers */
+static u16 token_map[TS_FSM_TYPE_MAX+1] = {
+       [TS_FSM_SPECIFIC] = 0,
+       [TS_FSM_WILDCARD] = _W,
+       [TS_FSM_CNTRL]    = _C,
+       [TS_FSM_LOWER]    = _L,
+       [TS_FSM_UPPER]    = _U,
+       [TS_FSM_PUNCT]    = _P,
+       [TS_FSM_SPACE]    = _S,
+       [TS_FSM_DIGIT]    = _D,
+       [TS_FSM_XDIGIT]   = _D | _X,
+       [TS_FSM_ALPHA]    = _U | _L,
+       [TS_FSM_ALNUM]    = _U | _L | _D,
+       [TS_FSM_PRINT]    = _P | _U | _L | _D | _SP,
+       [TS_FSM_GRAPH]    = _P | _U | _L | _D,
+       [TS_FSM_ASCII]    = _A,
+};
+
+static u16 token_lookup_tbl[256] = {
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,           /*   0-  3 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,           /*   4-  7 */
+_W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,                /*   8- 11 */
+_W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,           /*  12- 15 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,           /*  16- 19 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,           /*  20- 23 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,           /*  24- 27 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,           /*  28- 31 */
+_W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,           /*  32- 35 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,           /*  36- 39 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,           /*  40- 43 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,           /*  44- 47 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,           /*  48- 51 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,           /*  52- 55 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,           /*  56- 59 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,           /*  60- 63 */
+_W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,                /*  64- 67 */
+_W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,           /*  68- 71 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,           /*  72- 75 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,           /*  76- 79 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,           /*  80- 83 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,           /*  84- 87 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,           /*  88- 91 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,           /*  92- 95 */
+_W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,                /*  96- 99 */
+_W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,           /* 100-103 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,           /* 104-107 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,           /* 108-111 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,           /* 112-115 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,           /* 116-119 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,           /* 120-123 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,           /* 124-127 */
+_W,            _W,           _W,           _W,                 /* 128-131 */
+_W,            _W,           _W,           _W,                 /* 132-135 */
+_W,            _W,           _W,           _W,                 /* 136-139 */
+_W,            _W,           _W,           _W,                 /* 140-143 */
+_W,            _W,           _W,           _W,                 /* 144-147 */
+_W,            _W,           _W,           _W,                 /* 148-151 */
+_W,            _W,           _W,           _W,                 /* 152-155 */
+_W,            _W,           _W,           _W,                 /* 156-159 */
+_W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,              /* 160-163 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,              /* 164-167 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,              /* 168-171 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,              /* 172-175 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,              /* 176-179 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,              /* 180-183 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,              /* 184-187 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,              /* 188-191 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,              /* 192-195 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,              /* 196-199 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,              /* 200-203 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,              /* 204-207 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,              /* 208-211 */
+_W|_U,         _W|_U,        _W|_U,        _W|_P,              /* 212-215 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,              /* 216-219 */
+_W|_U,         _W|_U,        _W|_U,        _W|_L,              /* 220-223 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,              /* 224-227 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,              /* 228-231 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,              /* 232-235 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,              /* 236-239 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,              /* 240-243 */
+_W|_L,         _W|_L,        _W|_L,        _W|_P,              /* 244-247 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,              /* 248-251 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L};             /* 252-255 */
+
+static inline int match_token(struct ts_fsm_token *t, u8 d)
+{
+       if (t->type)
+               return (token_lookup_tbl[d] & t->type) != 0;
+       else
+               return t->value == d;
+}
+
+static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
+{
+       struct ts_fsm *fsm = ts_config_priv(conf);
+       struct ts_fsm_token *cur = NULL, *next;
+       unsigned int match_start, block_idx = 0, tok_idx;
+       unsigned block_len = 0, strict, consumed = state->offset;
+       const u8 *data;
+
+#define GET_NEXT_BLOCK()               \
+({     consumed += block_idx;          \
+       block_idx = 0;                  \
+       block_len = conf->get_next_block(consumed, &data, conf, state); })
+
+#define TOKEN_MISMATCH()               \
+       do {                            \
+               if (strict)             \
+                       goto no_match;  \
+               block_idx++;            \
+               goto startover;         \
+       } while(0)
+
+#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
+
+       if (end_of_data())
+               goto no_match;
+
+       strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;
+
+startover:
+       match_start = consumed + block_idx;
+
+       for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
+               cur = &fsm->tokens[tok_idx];
+
+               if (likely(tok_idx < (fsm->ntokens - 1)))
+                       next = &fsm->tokens[tok_idx + 1];
+               else
+                       next = NULL;
+
+               switch (cur->recur) {
+               case TS_FSM_SINGLE:
+                       if (end_of_data())
+                               goto no_match;
+
+                       if (!match_token(cur, data[block_idx]))
+                               TOKEN_MISMATCH();
+                       break;
+
+               case TS_FSM_PERHAPS:
+                       if (end_of_data() ||
+                           !match_token(cur, data[block_idx]))
+                               continue;
+                       break;
+
+               case TS_FSM_MULTI:
+                       if (end_of_data())
+                               goto no_match;
+
+                       if (!match_token(cur, data[block_idx]))
+                               TOKEN_MISMATCH();
+
+                       block_idx++;
+                       /* fall through */
+
+               case TS_FSM_ANY:
+                       if (next == NULL)
+                               goto found_match;
+
+                       if (end_of_data())
+                               continue;
+
+                       while (!match_token(next, data[block_idx])) {
+                               if (!match_token(cur, data[block_idx]))
+                                       TOKEN_MISMATCH();
+                               block_idx++;
+                               if (end_of_data())
+                                       goto no_match;
+                       }
+                       continue;
+
+               /*
+                * Optimization: Prefer small local loop over jumping
+                * back and forth until garbage at head is munched.
+                */
+               case TS_FSM_HEAD_IGNORE:
+                       if (end_of_data())
+                               continue;
+
+                       while (!match_token(next, data[block_idx])) {
+                               /*
+                                * Special case, don't start over upon
+                                * a mismatch, give the user the
+                                * chance to specify the type of data
+                                * allowed to be ignored.
+                                */
+                               if (!match_token(cur, data[block_idx]))
+                                       goto no_match;
+
+                               block_idx++;
+                               if (end_of_data())
+                                       goto no_match;
+                       }
+
+                       match_start = consumed + block_idx;
+                       continue;
+               }
+
+               block_idx++;
+       }
+
+       if (end_of_data())
+               goto found_match;
+
+no_match:
+       return UINT_MAX;
+
+found_match:
+       state->offset = consumed + block_idx;
+       return match_start;
+}
+
+static struct ts_config *fsm_init(const void *pattern, unsigned int len,
+                                    int gfp_mask)
+{
+       int i, err = -EINVAL;
+       struct ts_config *conf;
+       struct ts_fsm *fsm;
+       struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern;
+       unsigned int ntokens = len / sizeof(*tokens);
+       size_t priv_size = sizeof(*fsm) + len;
+
+       if (len  % sizeof(struct ts_fsm_token) || ntokens < 1)
+               goto errout;
+
+       for (i = 0; i < ntokens; i++) {
+               struct ts_fsm_token *t = &tokens[i];
+
+               if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
+                       goto errout;
+
+               if (t->recur == TS_FSM_HEAD_IGNORE &&
+                   (i != 0 || i == (ntokens - 1)))
+                       goto errout;
+       }
+
+       conf = alloc_ts_config(priv_size, gfp_mask);
+       if (IS_ERR(conf))
+               return conf;
+
+       fsm = ts_config_priv(conf);
+       fsm->ntokens = ntokens;
+       memcpy(fsm->tokens, pattern, len);
+
+       for (i = 0; i < fsm->ntokens; i++) {
+               struct ts_fsm_token *t = &fsm->tokens[i];
+               t->type = token_map[t->type];
+       }
+
+       return conf;
+
+errout:
+       return ERR_PTR(err);
+}
+
+static void *fsm_get_pattern(struct ts_config *conf)
+{
+       struct ts_fsm *fsm = ts_config_priv(conf);
+       return fsm->tokens;
+}
+
+static unsigned int fsm_get_pattern_len(struct ts_config *conf)
+{
+       struct ts_fsm *fsm = ts_config_priv(conf);
+       return fsm->ntokens * sizeof(struct ts_fsm_token);
+}
+
+static struct ts_ops fsm_ops = {
+       .name             = "fsm",
+       .find             = fsm_find,
+       .init             = fsm_init,
+       .get_pattern      = fsm_get_pattern,
+       .get_pattern_len  = fsm_get_pattern_len,
+       .owner            = THIS_MODULE,
+       .list             = LIST_HEAD_INIT(fsm_ops.list)
+};
+
+static int __init init_fsm(void)
+{
+       return textsearch_register(&fsm_ops);
+}
+
+static void __exit exit_fsm(void)
+{
+       textsearch_unregister(&fsm_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_fsm);
+module_exit(exit_fsm);