git-svn: trunk@3097
Török Edvin authored on 2007/06/10 01:09:45... | ... |
@@ -1,4 +1,8 @@ |
1 |
-Thu May 31 17:43:10 CEST 2007 (edwin) |
|
1 |
+Sat Jun 09 18:37:00 EEST 2007 (edwin) |
|
2 |
+------------------------------------ |
|
3 |
+ * libclamav/regex_list.c: first draft of new implementation for regex_list.c |
|
4 |
+ |
|
5 |
+Thu May 31 17:43:10 EEST 2007 (edwin) |
|
2 | 6 |
------------------------------------ |
3 | 7 |
* libclamav/regex_list.c: handle chaining of multiple OP_DOT in same node. |
4 | 8 |
(bug #529) |
... | ... |
@@ -24,7 +24,6 @@ |
24 | 24 |
#include "clamav-config.h" |
25 | 25 |
#endif |
26 | 26 |
|
27 |
- |
|
28 | 27 |
#ifndef CL_DEBUG |
29 | 28 |
#define NDEBUG |
30 | 29 |
#endif |
... | ... |
@@ -35,6 +34,17 @@ |
35 | 35 |
#endif |
36 | 36 |
#endif |
37 | 37 |
|
38 |
+ |
|
39 |
+/* TODO: when implementation of new version is complete, enable it in CL_EXPERIMENTAL */ |
|
40 |
+#ifdef CL_EXPERIMENTAL |
|
41 |
+//#define USE_NEW_VERSION |
|
42 |
+#endif |
|
43 |
+ |
|
44 |
+#ifndef USE_NEW_VERSION |
|
45 |
+/*this is the old version of regex_list.c |
|
46 |
+ *reason for redesign: there is only one node type that has to handle all the cases: binary search among children, alternatives list, match. |
|
47 |
+ * This design is very error-prone.*/ |
|
48 |
+ |
|
38 | 49 |
#include <stdio.h> |
39 | 50 |
#include <stdlib.h> |
40 | 51 |
#include <string.h> |
... | ... |
@@ -1493,3 +1503,252 @@ void dump_tree(struct tree_node* root) |
1493 | 1493 |
} |
1494 | 1494 |
#endif |
1495 | 1495 |
|
1496 |
+ |
|
1497 |
+#else |
|
1498 |
+/*------------------------New version of regex_list.c------------------------*/ |
|
1499 |
+ |
|
1500 |
+/* Regex_list.c: |
|
1501 |
+ * A scalable, trie-based implementation for matching against |
|
1502 |
+ * a list of regular expressions. |
|
1503 |
+ * |
|
1504 |
+ * A trivial way to implement matching against a list of regular expressions |
|
1505 |
+ * would have been to construct a single regular expression, by concatenating |
|
1506 |
+ * the list with the alternate (|) operator. |
|
1507 |
+ * BUT a usual DFA implementation of regular expression matching (eg.: GNU libc) |
|
1508 |
+ * leads to "state explosion" when there are many (5000+) alternate (|) operators. |
|
1509 |
+ * This is the reason for using a trie-based implementation. |
|
1510 |
+ * |
|
1511 |
+ * |
|
1512 |
+ * Design considerations: |
|
1513 |
+ * |
|
1514 |
+ * Recursive call points: there are situations when match has to be retried on a different sub-trie, or with a different repeat count. |
|
1515 |
+ * Alternate operators, and repeat/range operators (+,*,{}) are recursive call points. When a failure is encountered during a match, |
|
1516 |
+ * the function simply returns from the recursive call, and ends up at a failure point (recursive call point). |
|
1517 |
+ * |
|
1518 |
+ * "go to parent" below actually means, return from recursive call. |
|
1519 |
+ * |
|
1520 |
+ * Node types: |
|
1521 |
+ * OP_ROOT: contains information that applies to the entire trie. |
|
1522 |
+ * it can only appear as root node, and not as child node. |
|
1523 |
+ * On child fail: match has failed |
|
1524 |
+ * This is a recursive call point |
|
1525 |
+ * OP_CHAR_BINARY_SEARCH: chooses a sub-trie, based on current character; |
|
1526 |
+ * using binary-search |
|
1527 |
+ * On fail: go to node indicated by fail_action, or if |
|
1528 |
+ * fail_action is NULL, to parent |
|
1529 |
+ * On child fail: execute fail of current node |
|
1530 |
+ * OP_ALTERNATIVES: try matching each sub-trie, if all fails execute fail |
|
1531 |
+ * action of current node. This is a recursive call point |
|
1532 |
+ * OP_CHAR_REPEAT: repeat specified character a number of times in range: |
|
1533 |
+ * [min_range, max_range]; |
|
1534 |
+ * min_range: 0 for * operator |
|
1535 |
+ * 1 for + operator |
|
1536 |
+ * max_range: remaining length of current string for *,+ operator |
|
1537 |
+ * OR: min_range, max_range as specified by the {min,max} operator |
|
1538 |
+ * On fail: fail_action, or parent if NULL |
|
1539 |
+ * On child fail: reduce match repeat count, try again on child, if |
|
1540 |
+ * repeat count<min_range, execute fail of current node |
|
1541 |
+ * Not recommended to use this when min_range=max_range=1 |
|
1542 |
+ * This is a recursive call point |
|
1543 |
+ * OP_DOT_REPEAT: like OP_CHAR_REPEAT but accept any character |
|
1544 |
+ * Not recommended to use this when min_range=max_range=1 |
|
1545 |
+ * This is a recursive call point |
|
1546 |
+ * OP_GROUP_START: start of a group "(", also specifies group flags: |
|
1547 |
+ * repeat: is_repeat, min_range, max_range |
|
1548 |
+ * This is a recursive call point if is_repeat |
|
1549 |
+ * OP_GROUP_END: end of group ")" |
|
1550 |
+ * OP_STRCMP: compare with specified string, |
|
1551 |
+ * it has an array of fail actions, one for each character |
|
1552 |
+ * default fail action: go to parent |
|
1553 |
+ * This was introduced for memory- and speed-efficiency |
|
1554 |
+ * considerations. |
|
1555 |
+ * OP_CHAR_CLASS_REPEAT: match character with character class |
|
1556 |
+ * min_range, max_range |
|
1557 |
+ * For a single character class min_range=max_range=1 |
|
1558 |
+ * OP_MATCH_OK: match has succeeded |
|
1559 |
+ * |
|
1560 |
+ * TODO: maybe we'll need a more efficient way to choose between character classes. |
|
1561 |
+ * OP_DOT_REPEAT/OP_CHAR_REPEAT needs a more efficient specification of its failure function, instead of using |
|
1562 |
+ * backtracking approach. |
|
1563 |
+ * |
|
1564 |
+ * The failure function/action is just for optimization, the match algorithm works even without it. |
|
1565 |
+ * TODO:In this first draft fail action will always be NULL, in a later version I'll implement fail actions too. |
|
1566 |
+ * |
|
1567 |
+ * |
|
1568 |
+ */ |
|
1569 |
+ |
|
1570 |
+#include "cltypes.h" |
|
1571 |
+#include "others.h" |
|
1572 |
+ |
|
1573 |
/*
 * Fallback used only when no previously included header has defined
 * offsetof (the standard definition is provided by <stddef.h>).
 */
#ifndef offsetof
# define offsetof(type,memb) ((size_t)&((type*)0)->memb)
#endif

/*
 * container_of(ptr, type, member):
 *   Given a pointer to the field `member` embedded inside an object of
 *   type `type`, yield a pointer to the enclosing object.
 *
 * container_of_const(ptr, type, member):
 *   Same computation, but the intermediate byte pointer is const-qualified
 *   so that a pointer-to-const can be passed without discarding the
 *   qualifier; pass a const-qualified `type` to keep the result const.
 *
 * `ptr` is parenthesized in the expansion so that expression arguments
 * (e.g. `cond ? a : b`) are cast as a whole rather than only their first
 * operand.
 */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))
#define container_of_const(ptr, type, member) \
    ((type *)((const char *)(ptr) - offsetof(type, member)))
|
1580 |
+ |
|
1581 |
/*
 * Discriminator stored in every trie node; it tells match code which
 * concrete trie_node_* structure the node is embedded in.
 */
enum trie_node_type {
    OP_ROOT               = 0, /* root of the trie, never a child node */
    OP_CHAR_BINARY_SEARCH = 1, /* choose a sub-trie by current character */
    OP_ALTERNATIVES       = 2, /* try each sub-trie in turn */
    OP_CHAR_REPEAT        = 3, /* one character repeated within a range */
    OP_DOT_REPEAT         = 4, /* any character repeated within a range */
    OP_CHAR_CLASS_REPEAT  = 5, /* character class repeated within a range */
    OP_STRCMP             = 6, /* compare against a literal string */
    OP_GROUP_START        = 7, /* "(" */
    OP_GROUP_END          = 8, /* ")" */
    OP_MATCH_OK           = 9  /* match has succeeded */
};
|
1593 |
+ |
|
1594 |
+ |
|
1595 |
+/* the comon definition of a trie node */ |
|
1596 |
+struct trie_node |
|
1597 |
+{ |
|
1598 |
+ enum trie_node_type type; |
|
1599 |
+}; |
|
1600 |
+ |
|
1601 |
+struct trie_node_match { |
|
1602 |
+ struct trie_node node; |
|
1603 |
+ /* additional match info */ |
|
1604 |
+}; |
|
1605 |
+ |
|
1606 |
+struct trie_node_root |
|
1607 |
+{ |
|
1608 |
+ struct trie_node node; |
|
1609 |
+ struct trie_node* child; |
|
1610 |
+}; |
|
1611 |
+ |
|
1612 |
+struct trie_node_binary_search |
|
1613 |
+{ |
|
1614 |
+ struct trie_node node; |
|
1615 |
+ uint8_t children_count;/* number of children to search among -1! 255 = 256 children*/ |
|
1616 |
+ struct trie_node* fail_action; |
|
1617 |
+ struct trie_node** children; |
|
1618 |
+}; |
|
1619 |
+ |
|
1620 |
+struct trie_node_alternatives |
|
1621 |
+{ |
|
1622 |
+ struct trie_node node; |
|
1623 |
+ uint32_t alternatives_count; |
|
1624 |
+ /* need to support node with lots of alternatives, |
|
1625 |
+ * for a worst-case scenario where each line ends up as a sub-trie of OP_ALTERNATIVES*/ |
|
1626 |
+ struct trie_node* fail_action; |
|
1627 |
+ struct trie_node** children; |
|
1628 |
+}; |
|
1629 |
+ |
|
1630 |
+struct trie_node_char_repeat |
|
1631 |
+{ |
|
1632 |
+ struct trie_node node; |
|
1633 |
+ unsigned char character; |
|
1634 |
+ uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/ |
|
1635 |
+ struct trie_node* child; |
|
1636 |
+ struct trie_node* fail_action; |
|
1637 |
+}; |
|
1638 |
+ |
|
1639 |
+struct trie_node_dot_repeat |
|
1640 |
+{ |
|
1641 |
+ struct trie_node node; |
|
1642 |
+ uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/ |
|
1643 |
+ struct trie_node* child; |
|
1644 |
+ struct trie_node* fail_action; |
|
1645 |
+}; |
|
1646 |
+ |
|
1647 |
+struct trie_node_group_start |
|
1648 |
+{ |
|
1649 |
+ struct trie_node node; |
|
1650 |
+ uint8_t range_min, range_max;/* if range_min==range_max==1, then this is NOT a repeat, thus not a recursive call point*/ |
|
1651 |
+ struct trie_node* child; |
|
1652 |
+ struct trie_node* fail_action; |
|
1653 |
+}; |
|
1654 |
+ |
|
1655 |
+struct trie_node_group_end |
|
1656 |
+{ |
|
1657 |
+ struct trie_node node; |
|
1658 |
+ struct trie_node* child; |
|
1659 |
+}; |
|
1660 |
+ |
|
1661 |
+struct trie_node_strcmp |
|
1662 |
+{ |
|
1663 |
+ struct trie_node node; |
|
1664 |
+ uint8_t string_length;/* for longer strings a sequence of node_strcmp should be used */ |
|
1665 |
+ unsigned char* string; |
|
1666 |
+ struct trie_node* child; |
|
1667 |
+ struct trie_node** fail_actions;/* this has string_length elements */ |
|
1668 |
+}; |
|
1669 |
+ |
|
1670 |
+struct trie_node_char_class_repeat |
|
1671 |
+{ |
|
1672 |
+ struct trie_node node; |
|
1673 |
+ struct char_bitmap* bitmap; |
|
1674 |
+ uint8_t range_min, range_max; |
|
1675 |
+ struct trie_node* child; |
|
1676 |
+ struct trie_node* fail_action; |
|
1677 |
+}; |
|
1678 |
+ |
|
1679 |
+ |
|
1680 |
+static int match_node(const struct trie_node* node) |
|
1681 |
+{ |
|
1682 |
+ while(node->type != OP_MATCH_OK) { |
|
1683 |
+ switch(node->type) { |
|
1684 |
+ case OP_ROOT: |
|
1685 |
+ { |
|
1686 |
+ const struct trie_node_root* root_node = container_of_const(node, const struct trie_node_root, node); |
|
1687 |
+ node = root_node->child; |
|
1688 |
+ break; |
|
1689 |
+ } |
|
1690 |
+ case OP_CHAR_BINARY_SEARCH: |
|
1691 |
+ { |
|
1692 |
+ const struct trie_node_binary_search* bin_node = container_of_const(node, const struct trie_node_binary_search, node); |
|
1693 |
+ /* TODO: binary search */ |
|
1694 |
+ break; |
|
1695 |
+ } |
|
1696 |
+ case OP_ALTERNATIVES: |
|
1697 |
+ { |
|
1698 |
+ const struct trie_node_alternatives* alt_node = container_of_const(node, const struct trie_node_alternatives, node); |
|
1699 |
+ /* TODO: op_alt */ |
|
1700 |
+ break; |
|
1701 |
+ } |
|
1702 |
+ case OP_CHAR_REPEAT: |
|
1703 |
+ { |
|
1704 |
+ const struct trie_node_char_repeat* char_rep_node = container_of_const(node, const struct trie_node_char_repeat, node); |
|
1705 |
+ break; |
|
1706 |
+ } |
|
1707 |
+ case OP_DOT_REPEAT: |
|
1708 |
+ { |
|
1709 |
+ const struct trie_node_dot_repeat* dot_rep_node = container_of_const(node, const struct trie_node_dot_repeat, node); |
|
1710 |
+ break; |
|
1711 |
+ } |
|
1712 |
+ case OP_CHAR_CLASS_REPEAT: |
|
1713 |
+ { |
|
1714 |
+ const struct trie_node_char_class_repeat* class_rep_node = container_of_const(node, const struct trie_node_char_class_repeat, node); |
|
1715 |
+ break; |
|
1716 |
+ } |
|
1717 |
+ case OP_STRCMP: |
|
1718 |
+ { |
|
1719 |
+ const struct trie_node_strcmp* strcmp_node = container_of_const(node, const struct trie_node_strcmp, node); |
|
1720 |
+ break; |
|
1721 |
+ } |
|
1722 |
+ case OP_GROUP_START: |
|
1723 |
+ { |
|
1724 |
+ const struct trie_node_group_start* group_start_node = container_of_const(node, const struct trie_node_group_start, node); |
|
1725 |
+ break; |
|
1726 |
+ } |
|
1727 |
+ case OP_GROUP_END: |
|
1728 |
+ { |
|
1729 |
+ const struct trie_node_group_end* group_end_node = container_of_const(node, const struct trie_node_group_end, node); |
|
1730 |
+ break; |
|
1731 |
+ } |
|
1732 |
+ default: |
|
1733 |
+ { |
|
1734 |
+ cli_warnmsg("Unimplemented node type:%d", node->type); |
|
1735 |
+ return 0; |
|
1736 |
+ break; |
|
1737 |
+ } |
|
1738 |
+ } |
|
1739 |
+ } |
|
1740 |
+ return 1;/* match */ |
|
1741 |
+} |
|
1742 |
+ |
|
1743 |
+#endif |
|
1744 |
+ |