Browse code

first draft of new implementation for regex_list.c

git-svn: trunk@3097

Török Edvin authored on 2007/06/10 01:09:45
Showing 2 changed files
... ...
@@ -1,4 +1,8 @@
1
-Thu May 31 17:43:10 CEST 2007 (edwin)
1
+Sat Jun 09 18:37:00 EEST 2007 (edwin)
2
+------------------------------------
3
+  * libclamav/regex_list.c: first draft of new implementation for regex_list.c 
4
+
5
+Thu May 31 17:43:10 EEST 2007 (edwin)
2 6
 ------------------------------------
3 7
   * libclamav/regex_list.c: handle chaining of multiple OP_DOT in same node.
4 8
   (bug #529)
... ...
@@ -24,7 +24,6 @@
24 24
 #include "clamav-config.h"
25 25
 #endif
26 26
 
27
-
28 27
 #ifndef CL_DEBUG
29 28
 #define NDEBUG
30 29
 #endif
... ...
@@ -35,6 +34,17 @@
35 35
 #endif
36 36
 #endif
37 37
 
38
+
39
+/* TODO: when implementation of new version is complete, enable it in CL_EXPERIMENTAL */
40
+#ifdef CL_EXPERIMENTAL
41
+//#define USE_NEW_VERSION
42
+#endif
43
+
44
+#ifndef USE_NEW_VERSION
45
+/*this is the old version of regex_list.c
46
+ *reason for redesign: there is only one node type that has to handle all the cases: binary search among children, alternatives list, match.
47
+ * This design is very error-prone.*/
48
+
38 49
 #include <stdio.h>
39 50
 #include <stdlib.h>
40 51
 #include <string.h>
... ...
@@ -1493,3 +1503,252 @@ void dump_tree(struct tree_node* root)
1493 1493
 }
1494 1494
 #endif
1495 1495
 
1496
+
1497
+#else
1498
+/*------------------------New version of regex_list.c------------------------*/
1499
+
1500
+/* Regex_list.c: 
1501
+ * A scalable, trie-based implementation for matching against 
1502
+ * a list of regular expressions.
1503
+ *
1504
+ * A trivial way to implement matching against a list of regular expressions 
1505
+ * would have been to construct a single regular expression, by concatenating 
1506
+ * the list with the alternate (|) operator.
1507
+ * BUT a usual DFA implementation of regular expression matching (eg.: GNU libc)
1508
+ * leads to "state explosion" when there are many (5000+) alternate (|) operators.
1509
+ * This is the reason for using a trie-based implementation.
1510
+ *
1511
+ *
1512
+ * Design considerations:
1513
+ *
1514
+ * Recursive call points: there are situations when match has to be retried on a different sub-trie, or with a different repeat count.
1515
+ * Alternate operators, and repeat/range operators (+,*,{}) are recursive call points. When a failure is encountered during a match,
1516
+ * the function simply returns from the recursive call, and ends up at a failure point (recursive call point).
1517
+ *
1518
+ * "go to parent" below actually means, return from recursive call.
1519
+ *
1520
+ * Node types:
1521
+ * 	OP_ROOT: contains information that applies to the entire trie.
1522
+ * 		it can only appear as root node, and not as child node.
1523
+ * 		On child fail: match has failed
1524
+ * 		This is a recursive call point
1525
+ * 	OP_CHAR_BINARY_SEARCH: chooses a sub-trie, based on current character; 
1526
+ * 			using binary-search
1527
+ * 			On fail: go to node indicated by fail_action, or if 
1528
+ * 				fail_action is NULL, to parent
1529
+ * 			On child fail: execute fail of current node
1530
+ * 	OP_ALTERNATIVES: try matching each sub-trie; if all fail, execute fail
1531
+ * 		action of current node. This is a recursive call point
1532
+ * 	OP_CHAR_REPEAT: repeat specified character a number of times in range:
1533
+ *		[min_range, max_range]; 
1534
+ *			min_range: 0 for * operator
1535
+ *				   1 for + operator
1536
+ *			max_range: remaining length of current string for *,+ operator
1537
+ *			OR: min_range, max_range as specified by the {min,max} operator
1538
+ *		On fail: fail_action, or parent if NULL
1539
+ *		On child fail: reduce match repeat count, try again on child, if
1540
+ *			repeat count<min_range, execute fail of current node
1541
+ *		Not recommended to use this when min_range=max_range=1
1542
+ *		This is a recursive call point
1543
+ *	OP_DOT_REPEAT: like OP_CHAR_REPEAT but accept any character
1544
+ *		Not recommended to use this when min_range=max_range=1
1545
+ *		This is a recursive call point
1546
+ *	OP_GROUP_START: start of a group "(", also specifies group flags:
1547
+ *		repeat: is_repeat, min_range, max_range
1548
+ *		This is a recursive call point if is_repeat
1549
+ *	OP_GROUP_END: end of group ")"
1550
+ *      OP_STRCMP: compare with specified string,
1551
+ *      	   it has an array of fail actions, one for each character
1552
+ *      	   default fail action: go to parent
1553
+ *      	   This was introduced for memory- and speed-efficiency
1554
+ *      	   considerations. 
1555
+ *      OP_CHAR_CLASS_REPEAT: match character with character class
1556
+ *      	min_range, max_range
1557
+ *      	For a single character class min_range=max_range=1
1558
+ *	OP_MATCH_OK: match has succeeded
1559
+ *
1560
+ * TODO: maybe we'll need a more efficient way to choose between character classes.
1561
+ *       OP_DOT_REPEAT/OP_CHAR_REPEAT needs a more efficient specification of its failure function, instead of using
1562
+ *       backtracking approach.
1563
+ *
1564
+ * The failure function/action is just an optimization; the match algorithm works even without it.
1565
+ * TODO:In this first draft fail action will always be NULL, in a later version I'll implement fail actions too.
1566
+ *
1567
+ *
1568
+ */ 
1569
+
1570
+#include "cltypes.h"
1571
+#include "others.h"
1572
+
1573
+/* offsetof is standard C (<stddef.h>); this fallback is used only if it has not been defined */
1574
+#ifndef offsetof
1575
+#   define offsetof(type,memb) ((size_t)&((type*)0)->memb)
1576
+#endif
1577
+
1578
+#define container_of(ptr, type, member) ( (type *) ((char *)ptr - offsetof(type, member)) )
1579
+#define container_of_const(ptr, type, member) ( (type *) ((const char *)ptr - offsetof(type, member)) )
1580
+
1581
+enum trie_node_type {
1582
+	OP_ROOT,
1583
+	OP_CHAR_BINARY_SEARCH,
1584
+	OP_ALTERNATIVES,
1585
+	OP_CHAR_REPEAT,
1586
+	OP_DOT_REPEAT,
1587
+	OP_CHAR_CLASS_REPEAT,
1588
+	OP_STRCMP,
1589
+	OP_GROUP_START,
1590
+	OP_GROUP_END,
1591
+	OP_MATCH_OK
1592
+};
1593
+
1594
+
1595
+/* the common definition of a trie node */
1596
+struct trie_node
1597
+{
1598
+	enum trie_node_type type;
1599
+};
1600
+
1601
+struct trie_node_match {
1602
+	struct trie_node node;
1603
+	/* additional match info */
1604
+};
1605
+
1606
+struct trie_node_root
1607
+{
1608
+	struct trie_node node;
1609
+	struct trie_node* child;
1610
+};
1611
+
1612
+struct trie_node_binary_search
1613
+{
1614
+	struct trie_node node;
1615
+	uint8_t children_count;/* number of children to search among -1! 255 = 256 children*/	
1616
+	struct trie_node* fail_action;
1617
+	struct trie_node** children;
1618
+};
1619
+
1620
+struct trie_node_alternatives
1621
+{
1622
+	struct trie_node node;
1623
+	uint32_t alternatives_count;
1624
+	/* need to support node with lots of alternatives, 
1625
+	 * for a worst-case scenario where each line ends up as a sub-trie of OP_ALTERNATIVES*/
1626
+	struct trie_node* fail_action;
1627
+	struct trie_node** children;
1628
+};
1629
+
1630
+struct trie_node_char_repeat
1631
+{
1632
+	struct trie_node node;
1633
+	unsigned char character;
1634
+	uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
1635
+	struct trie_node* child;
1636
+	struct trie_node* fail_action;
1637
+};
1638
+
1639
+struct trie_node_dot_repeat
1640
+{
1641
+	struct trie_node node;
1642
+	uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
1643
+	struct trie_node* child;
1644
+	struct trie_node* fail_action;
1645
+};
1646
+
1647
+struct trie_node_group_start
1648
+{
1649
+	struct trie_node node;
1650
+	uint8_t range_min, range_max;/* if range_min==range_max==1, then this is NOT a repeat, thus not a recursive call point*/
1651
+	struct trie_node* child;
1652
+	struct trie_node* fail_action;	
1653
+};
1654
+
1655
+struct trie_node_group_end
1656
+{
1657
+	struct trie_node node;
1658
+	struct trie_node* child;
1659
+};
1660
+
1661
+struct trie_node_strcmp
1662
+{
1663
+	struct trie_node node;
1664
+	uint8_t string_length;/* for longer strings a sequence of node_strcmp should be used */
1665
+	unsigned char* string;
1666
+	struct trie_node* child;
1667
+	struct trie_node** fail_actions;/* this has string_length elements */
1668
+};
1669
+
1670
+struct trie_node_char_class_repeat
1671
+{
1672
+	struct trie_node node;
1673
+	struct char_bitmap* bitmap;
1674
+	uint8_t range_min, range_max;
1675
+	struct trie_node* child;
1676
+	struct trie_node* fail_action;
1677
+};
1678
+
1679
+
1680
+static int match_node(const struct trie_node* node)
1681
+{
1682
+	while(node->type != OP_MATCH_OK) {	
1683
+		switch(node->type) {
1684
+			case OP_ROOT:
1685
+				{	
1686
+					const struct trie_node_root* root_node = container_of_const(node, const struct trie_node_root, node);
1687
+					node = root_node->child;
1688
+					break;
1689
+				}
1690
+			case OP_CHAR_BINARY_SEARCH:
1691
+				{
1692
+					const struct trie_node_binary_search* bin_node = container_of_const(node, const struct trie_node_binary_search, node);
1693
+					/* TODO: binary search */
1694
+					break;
1695
+				}
1696
+			case OP_ALTERNATIVES:
1697
+				{
1698
+					const struct trie_node_alternatives* alt_node = container_of_const(node, const struct trie_node_alternatives, node);
1699
+					/* TODO: op_alt */
1700
+					break;
1701
+				}
1702
+			case OP_CHAR_REPEAT:
1703
+				{
1704
+					const struct trie_node_char_repeat* char_rep_node = container_of_const(node, const struct trie_node_char_repeat, node);
1705
+					break;
1706
+				}
1707
+			case OP_DOT_REPEAT:
1708
+				{
1709
+					const struct trie_node_dot_repeat* dot_rep_node = container_of_const(node, const struct trie_node_dot_repeat, node);
1710
+					break;
1711
+				}
1712
+			case OP_CHAR_CLASS_REPEAT:
1713
+				{
1714
+					const struct trie_node_char_class_repeat* class_rep_node = container_of_const(node, const struct trie_node_char_class_repeat, node);
1715
+					break;
1716
+				}
1717
+			case OP_STRCMP:
1718
+				{
1719
+					const struct trie_node_strcmp* strcmp_node = container_of_const(node, const struct trie_node_strcmp, node);
1720
+					break;
1721
+				}
1722
+			case OP_GROUP_START:
1723
+				{
1724
+					const struct trie_node_group_start* group_start_node = container_of_const(node, const struct trie_node_group_start, node);
1725
+					break;
1726
+				}
1727
+			case OP_GROUP_END:
1728
+				{
1729
+					const struct trie_node_group_end* group_end_node = container_of_const(node, const struct trie_node_group_end, node);
1730
+					break;
1731
+				}
1732
+			default:
1733
+				{
1734
+					cli_warnmsg("Unimplemented node type:%d", node->type);
1735
+					return 0;
1736
+					break;
1737
+				}
1738
+		}
1739
+	}
1740
+	return 1;/* match */
1741
+}
1742
+
1743
+#endif
1744
+