Browse code

continue draft of new regex_list.c

git-svn: trunk@3098

Török Edvin authored on 2007/06/10 05:46:40
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+Sat Jun 09 23:16:00 EEST 2007 (edwin)
2
+------------------------------------
3
+  * libclamav/regex_list.c: draft of new regex_list.c
4
+
1 5
 Sat Jun 09 18:37:00 EEST 2007 (edwin)
2 6
 ------------------------------------
3 7
   * libclamav/regex_list.c: first draft of new implementation for regex_list.c 
... ...
@@ -1527,11 +1527,14 @@ void dump_tree(struct tree_node* root)
1527 1527
  *
1528 1528
  * "go to parent" below actually means, return from recursive call.
1529 1529
  *
1530
+ * fail_action: we need to return to closest failure point (recursive call point),
1531
+ *  and switch current node to the node pointed to by fail_action
1532
+ *
1530 1533
  * Node types:
1531 1534
  * 	OP_ROOT: contains information that applies to the entire trie.
1532 1535
  * 		it can only appear as root node, and not as child node.
1533 1536
  * 		On child fail: match has failed
1534
- * 		This is a recursive call point
1537
+ * 		This is NOT a recursive call point
1535 1538
  * 	OP_CHAR_BINARY_SEARCH: chooses a sub-trie, based on current character; 
1536 1539
  * 			using binary-search
1537 1540
  * 			On fail: go to node indicated by fail_action, or if 
... ...
@@ -1548,6 +1551,8 @@ void dump_tree(struct tree_node* root)
1548 1548
  *		On fail: fail_action, or parent if NULL
1549 1549
  *		On child fail: reduce match repeat count, try again on child, if
1550 1550
  *			repeat count<min_range, execute fail of current node
1551
+ *		Also has a bitmap on what characters are accepted beyond it,
1552
+ *		as an optimization for the case when a maximum match isn't possible
1551 1553
 *		Not recommended to use this when min_range=max_range=1
1552 1554
  *		This is a recursive call point
1553 1555
  *	OP_DOT_REPEAT: like OP_CHAR_REPEAT but accept any character
... ...
@@ -1577,6 +1582,7 @@ void dump_tree(struct tree_node* root)
1577 1577
  *
1578 1578
  */ 
1579 1579
 
1580
+#include <string.h>
1580 1581
 #include "cltypes.h"
1581 1582
 #include "others.h"
1582 1583
 
... ...
@@ -1624,7 +1630,8 @@ struct trie_node_binary_search
1624 1624
 	struct trie_node node;
1625 1625
 	uint8_t children_count;/* number of children to search among -1! 255 = 256 children*/	
1626 1626
 	struct trie_node* fail_action;
1627
-	struct trie_node** children;
1627
+	unsigned char* char_choices;/* children_count elements */
1628
+	struct trie_node** children;/*children_count elements */
1628 1629
 };
1629 1630
 
1630 1631
 struct trie_node_alternatives
... ...
@@ -1642,6 +1649,9 @@ struct trie_node_char_repeat
1642 1642
 	struct trie_node node;
1643 1643
 	unsigned char character;
1644 1644
 	uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
1645
+	struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this, 
1646
+						   to optimize repeat < max_range case; if it is NULL
1647
+						   there is no optimization*/
1645 1648
 	struct trie_node* child;
1646 1649
 	struct trie_node* fail_action;
1647 1650
 };
... ...
@@ -1650,6 +1660,9 @@ struct trie_node_dot_repeat
1650 1650
 {
1651 1651
 	struct trie_node node;
1652 1652
 	uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
1653
+	struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this, 
1654
+						   to optimize repeat < max_range case; if it is NULL
1655
+						   there is no optimization*/
1653 1656
 	struct trie_node* child;
1654 1657
 	struct trie_node* fail_action;
1655 1658
 };
... ...
@@ -1674,22 +1687,65 @@ struct trie_node_strcmp
1674 1674
 	uint8_t string_length;/* for longer strings a sequence of node_strcmp should be used */
1675 1675
 	unsigned char* string;
1676 1676
 	struct trie_node* child;
1677
-	struct trie_node** fail_actions;/* this has string_length elements */
1677
+	struct trie_node** fail_actions;/* this has string_length elements, or NULL if no fail_actions are computed */
1678 1678
 };
1679 1679
 
1680 1680
 struct trie_node_char_class_repeat
1681 1681
 {
1682 1682
 	struct trie_node node;
1683 1683
 	struct char_bitmap* bitmap;
1684
+	struct char_bitmap* bitmap_accept_after;
1684 1685
 	uint8_t range_min, range_max;
1685 1686
 	struct trie_node* child;
1686 1687
 	struct trie_node* fail_action;
1687 1688
 };
1688 1689
 
1690
+static inline int bitmap_accepts(const struct char_bitmap* bitmap, const char c)
1691
+{
1692
+	/* TODO: check if c is accepted by bitmap */
1693
+	return 0;
1694
+}
1695
+
1696
+#define MATCH_FAILED 0
1697
+#define MATCH_OK     1
1689 1698
 
1690
-static int match_node(const struct trie_node* node)
1699
+#define FAIL_ACTION( fail_node ) (*fail_action = (fail_node), MATCH_FAILED)
1700
+
1701
+
1702
+#ifndef MIN
1703
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
1704
+#endif
1705
+
1706
+static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action);
1707
+
1708
+static int match_repeat(const unsigned char* text, const unsigned char* text_end, const size_t range_min, const size_t repeat_start, 
1709
+		const struct char_bitmap* bitmap_accept_after, const struct trie_node* child, const struct trie_node** fail_action,
1710
+		const struct trie_node* this_fail_action)
1691 1711
 {
1692
-	while(node->type != OP_MATCH_OK) {	
1712
+	size_t i;
1713
+	for(i = repeat_start;i > range_min;i--) {
1714
+		if(!bitmap_accept_after || bitmap_accepts( bitmap_accept_after, text[i-1])) {
1715
+			int rc = match_node(child, &text[i], text_end, fail_action);
1716
+			/* ignore fail_action for now, we have the bitmap_accept_after optimization */
1717
+			if(rc) {
1718
+				return MATCH_OK;
1719
+			}
1720
+		}						
1721
+	}
1722
+	if(!range_min) {
1723
+		/* this match is optional, try child only */
1724
+		int rc = match_node(child, text, text_end, fail_action);
1725
+		if(rc) {
1726
+			return MATCH_OK;
1727
+		}
1728
+	}
1729
+	return FAIL_ACTION(this_fail_action);
1730
+}
1731
+
1732
+/* text_end points to \0 in text */
1733
+static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action)
1734
+{
1735
+	while(node && text < text_end) {	
1693 1736
 		switch(node->type) {
1694 1737
 			case OP_ROOT:
1695 1738
 				{	
... ...
@@ -1698,56 +1754,151 @@ static int match_node(const struct trie_node* node)
1698 1698
 					break;
1699 1699
 				}
1700 1700
 			case OP_CHAR_BINARY_SEARCH:
1701
-				{
1701
+				{					
1702 1702
 					const struct trie_node_binary_search* bin_node = container_of_const(node, const struct trie_node_binary_search, node);
1703
-					/* TODO: binary search */
1703
+					const unsigned char csearch = *text;
1704
+					size_t mid, left = 0, right = bin_node->children_count-1;					
1705
+					while(left<=right) {
1706
+						mid = left+(right-left)/2;
1707
+						if(bin_node->char_choices[mid] == csearch)
1708
+							break;
1709
+						else if(bin_node->char_choices[mid] < csearch)
1710
+							left = mid+1;
1711
+						else
1712
+							right = mid-1;
1713
+					}
1714
+					if(left <= right) {
1715
+						/* match successful */
1716
+						node = bin_node->children[mid];
1717
+						++text;
1718
+					}
1719
+					else {
1720
+						return FAIL_ACTION( bin_node->fail_action );
1721
+					}
1704 1722
 					break;
1705 1723
 				}
1706 1724
 			case OP_ALTERNATIVES:
1707 1725
 				{
1708 1726
 					const struct trie_node_alternatives* alt_node = container_of_const(node, const struct trie_node_alternatives, node);
1709
-					/* TODO: op_alt */
1727
+					size_t i;
1728
+					*fail_action = NULL;
1729
+					for(i=0;i < alt_node->alternatives_count;i++) {
1730
+						int rc = match_node(alt_node->children[i], text, text_end, fail_action);
1731
+						if(rc) {							
1732
+							return MATCH_OK;
1733
+						}
1734
+						/* supporting fail_actions is tricky,
1735
+						 *  if we just go to the node specified, what happens if the match fails, and no
1736
+						 *  further fail_action is specified? We should know where to continue the search.
1737
+						 * For now fail_action isn't supported for OP_ALTERNATIVES*/						
1738
+					}
1710 1739
 					break;
1711 1740
 				}
1712 1741
 			case OP_CHAR_REPEAT:
1713 1742
 				{
1714 1743
 					const struct trie_node_char_repeat* char_rep_node = container_of_const(node, const struct trie_node_char_repeat, node);
1715
-					break;
1744
+					const size_t max_len = MIN( text_end - text, char_rep_node->range_max-1);
1745
+					/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
1746
+					const char caccept = char_rep_node->character;
1747
+					size_t rep;
1748
+
1749
+					if(max_len < char_rep_node->range_min)
1750
+						return FAIL_ACTION(char_rep_node->fail_action);
1751
+
1752
+					for(rep=0;rep < max_len;rep++) {
1753
+						if(text[rep] != caccept) {
1754
+							break;
1755
+						}
1756
+					}
1757
+
1758
+					return match_repeat(text, text_end, char_rep_node->range_min, rep,
1759
+							char_rep_node->bitmap_accept_after, char_rep_node->child, fail_action,
1760
+							char_rep_node->fail_action);
1716 1761
 				}
1717 1762
 			case OP_DOT_REPEAT:
1718 1763
 				{
1719 1764
 					const struct trie_node_dot_repeat* dot_rep_node = container_of_const(node, const struct trie_node_dot_repeat, node);
1720
-					break;
1765
+					const size_t max_len = MIN( text_end - text, dot_rep_node->range_max-1);
1766
+					/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
1767
+
1768
+					if(max_len < dot_rep_node->range_min)
1769
+						return FAIL_ACTION(dot_rep_node->fail_action);
1770
+
1771
+					return match_repeat(text, text_end, dot_rep_node->range_min, max_len,
1772
+							dot_rep_node->bitmap_accept_after, dot_rep_node->child, fail_action,
1773
+							dot_rep_node->fail_action);
1721 1774
 				}
1722 1775
 			case OP_CHAR_CLASS_REPEAT:
1723 1776
 				{
1724 1777
 					const struct trie_node_char_class_repeat* class_rep_node = container_of_const(node, const struct trie_node_char_class_repeat, node);
1778
+					const size_t max_len = MIN( text_end - text, class_rep_node->range_max-1);
1779
+					/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
1780
+					size_t rep;
1781
+
1782
+					if(max_len < class_rep_node->range_min)
1783
+						return FAIL_ACTION(class_rep_node->fail_action);
1784
+
1785
+					for(rep=0;rep < max_len;rep++) {
1786
+						if(!bitmap_accepts( class_rep_node->bitmap, text[rep])) {
1787
+							break;
1788
+						}
1789
+					}
1790
+
1791
+					return match_repeat(text, text_end, class_rep_node->range_min, rep,
1792
+							class_rep_node->bitmap_accept_after, class_rep_node->child, fail_action,
1793
+							class_rep_node->fail_action);
1725 1794
 					break;
1726 1795
 				}
1727 1796
 			case OP_STRCMP:
1728 1797
 				{
1729 1798
 					const struct trie_node_strcmp* strcmp_node = container_of_const(node, const struct trie_node_strcmp, node);
1799
+					size_t i;
1800
+					if(strcmp_node->fail_actions) {
1801
+						const size_t max_len = MIN(strcmp_node->string_length, text_end-text);
1802
+						/* we don't use strncmp, because we need the exact match-fail point */
1803
+						for(i=0;i < max_len;i++) {
1804
+							if(text[i] != strcmp_node->string[i]) {
1805
+								return FAIL_ACTION( strcmp_node->fail_actions[i] );
1806
+							}
1807
+						}
1808
+						if(max_len < strcmp_node->string_length) {
1809
+							/* failed, because text was shorter */
1810
+							return FAIL_ACTION( strcmp_node->fail_actions[max_len] );
1811
+						}
1812
+					}
1813
+					else {
1814
+						/* no fail_actions computed, some shortcuts possible on compare */
1815
+						if((text_end - text < strcmp_node->string_length) ||
1816
+								strncmp((const char*)text, (const char*)strcmp_node->string, strcmp_node->string_length)) {
1817
+
1818
+							return FAIL_ACTION( NULL );
1819
+						}
1820
+					}
1821
+					/* match successful */
1822
+					node = strcmp_node->child;
1823
+					text += strcmp_node->string_length;
1730 1824
 					break;
1731 1825
 				}
1732 1826
 			case OP_GROUP_START:
1733 1827
 				{
1734 1828
 					const struct trie_node_group_start* group_start_node = container_of_const(node, const struct trie_node_group_start, node);
1829
+					/* TODO: implement */
1735 1830
 					break;
1736 1831
 				}
1737 1832
 			case OP_GROUP_END:
1738
-				{
1833
+				{					
1739 1834
 					const struct trie_node_group_end* group_end_node = container_of_const(node, const struct trie_node_group_end, node);
1835
+					/* TODO: implement */
1740 1836
 					break;
1741 1837
 				}
1742
-			default:
1838
+			case OP_MATCH_OK:
1743 1839
 				{
1744
-					cli_warnmsg("Unimplemented node type:%d", node->type);
1745
-					return 0;
1746
-					break;
1840
+					return MATCH_OK;
1747 1841
 				}
1748 1842
 		}
1749 1843
 	}
1750
-	return 1;/* match */
1844
+	/* if fail_action was NULL, or text ended*/
1845
+	return MATCH_FAILED;
1751 1846
 }
1752 1847
 
1753 1848
 #endif