Browse code

Increase limit for finding PE files embedded in other PE files

I am seeing missed detections since we changed to prohibit embedded
file type identification when inside an embedded file.
In particular, I'm seeing this issue with PE files that contain multiple
other MSEXE as well as a variety of false positives for PE file headers.

For example, imagine a PE with four concatenated DLL's, like so:
```
[ EXE file | DLL #1 | DLL #2 | DLL #3 | DLL #4 ]
```

And note that false positives for embedded MSEXE files are fairly common.
So there may be a few mixed in there.

Before limiting embedded file identification we might interpret the file
structure something like this:
```
MSEXE: {
embedded MSEXE #1: false positive,
embedded MSEXE #2: false positive,
embedded MSEXE #3: false positive,
embedded MSEXE #4: DLL #1: {
embedded MSEXE #1: false positive,
embedded MSEXE #2: DLL #2: {
embedded MSEXE #1: DLL #3: {
embedded MSEXE #1: false positive,
embedded MSEXE #2: false positive,
embedded MSEXE #3: false positive,
embedded MSEXE #4: false positive,
embedded MSEXE #5: DLL #4
}
embedded MSEXE #2: false positive,
embedded MSEXE #3: false positive,
embedded MSEXE #4: false positive,
embedded MSEXE #5: false positive,
embedded MSEXE #6: DLL #4
}
embedded MSEXE #3: DLL #3,
embedded MSEXE #4: false positive,
embedded MSEXE #5: false positive,
embedded MSEXE #6: false positive,
embedded MSEXE #7: false positive,
embedded MSEXE #8: DLL #4
}
}
```

This is obviously terrible, which is why why we don't allow detecting
embedded files within other embedded files.
So after we enforce that limit, the same file may be interpreted like
this instead:
```
MSEXE: {
embedded MSEXE #1: false positive,
embedded MSEXE #2: false positive,
embedded MSEXE #3: false positive,
embedded MSEXE #4: DLL #1,
embedded MSEXE #5: false positive,
embedded MSEXE #6: DLL #2,
embedded MSEXE #7: DLL #3,
embedded MSEXE #8: false positive,
embedded MSEXE #9: false positive,
embedded MSEXE #10: false positive,
embedded MSEXE #11: false positive,
embedded MSEXE #12: DLL #4
}
```

That's great! Except that we now exceed the "MAX_EMBEDDED_OBJ" limit
for embedded type matches (limit 10, but 12 found). That means we won't
see or extract the 4th DLL anymore.

My solution is to lift the limit when adding an matched MSEXE type.
We already do this for matched ZIPSFX types.
While doing this, I've significantly tidied up the limits checks to
make it more readble, and removed duplicate checks from within the
`ac_addtype()` function.

CLAM-2897

Val S. authored on 2025/10/13 05:05:17
Showing 1 changed files
... ...
@@ -833,8 +833,9 @@ int cli_ac_chklsig(const char *expr, const char *end, uint32_t *lsigcnt, unsigne
833 833
                     return -1;
834 834
                 }
835 835
 
836
-                for (i += 2; i + 1 < len && (isdigit(expr[i + 1]) || expr[i + 1] == ','); i++)
837
-                    ;
836
+                for (i += 2; i + 1 < len && (isdigit(expr[i + 1]) || expr[i + 1] == ','); i++) {
837
+                    continue;
838
+                }
838 839
             }
839 840
 
840 841
             if (&expr[i + 1] == rend)
... ...
@@ -1625,19 +1626,23 @@ void cli_ac_freedata(struct cli_ac_data *data)
1625 1625
     }
1626 1626
 }
1627 1627
 
1628
-/* returns only CL_SUCCESS or CL_EMEM */
1629
-inline static int ac_addtype(struct cli_matched_type **list, cli_file_t type, off_t offset, const cli_ctx *ctx)
1628
+/**
1629
+ * @brief Add a match for an object type to the list of matched types.
1630
+ *
1631
+ * Important: The caller is responsible for checking limits!
1632
+ *
1633
+ * @param list Pointer to the list of matched types. *list may be NULL if no types have been added yet.
1634
+ * @param type The type of the embedded object.
1635
+ * @param offset The offset of the embedded object.
1636
+ * @param ctx The context information. May be NULL.
1637
+ * @return cl_error_t CL_SUCCESS regardless if added, or CL_EMEM if memory allocation failed.
1638
+ */
1639
+inline static cl_error_t ac_addtype(struct cli_matched_type **list, cli_file_t type, off_t offset, const cli_ctx *ctx)
1630 1640
 {
1631
-    struct cli_matched_type *tnode, *tnode_last;
1641
+    struct cli_matched_type *tnode;
1632 1642
 
1633
-    if (type == CL_TYPE_ZIPSFX) {
1634
-        if (*list && ctx && ctx->engine->maxfiles && (*list)->cnt > ctx->engine->maxfiles)
1635
-            return CL_SUCCESS;
1636
-    } else if (*list && (*list)->cnt >= MAX_EMBEDDED_OBJ) {
1637
-        return CL_SUCCESS;
1638
-    }
1639
-
1640
-    if (!(tnode = calloc(1, sizeof(struct cli_matched_type)))) {
1643
+    tnode = calloc(1, sizeof(struct cli_matched_type));
1644
+    if (NULL == tnode) {
1641 1645
         cli_errmsg("cli_ac_addtype: Can't allocate memory for new type node\n");
1642 1646
         return CL_EMEM;
1643 1647
     }
... ...
@@ -1645,16 +1650,25 @@ inline static int ac_addtype(struct cli_matched_type **list, cli_file_t type, of
1645 1645
     tnode->type   = type;
1646 1646
     tnode->offset = offset;
1647 1647
 
1648
-    tnode_last = *list;
1649
-    while (tnode_last && tnode_last->next)
1650
-        tnode_last = tnode_last->next;
1648
+    if (*list) {
1649
+        // Add to end of existing list.
1650
+        struct cli_matched_type *tnode_last = *list;
1651 1651
 
1652
-    if (tnode_last)
1652
+        while (tnode_last && tnode_last->next) {
1653
+            tnode_last = tnode_last->next;
1654
+        }
1653 1655
         tnode_last->next = tnode;
1654
-    else
1656
+    } else {
1657
+        // First type in the list.
1655 1658
         *list = tnode;
1659
+    }
1656 1660
 
1657 1661
     (*list)->cnt++;
1662
+
1663
+    if (UNLIKELY(cli_get_debug_flag())) {
1664
+        cli_dbgmsg("ac_addtype: added %s embedded object at offset " STDi64 ". Embedded object count: %d\n", cli_ftname(type), (uint64_t)offset, (*list)->cnt);
1665
+    }
1666
+
1658 1667
     return CL_SUCCESS;
1659 1668
 }
1660 1669
 
... ...
@@ -1999,14 +2013,65 @@ cl_error_t cli_ac_scanbuff(
1999 1999
 
2000 2000
                                         cli_dbgmsg("Matched signature for file type %s\n", pt->virname);
2001 2001
                                         type = pt->type;
2002
-                                        if ((ftoffset != NULL) &&
2003
-                                            ((*ftoffset == NULL) || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ || type == CL_TYPE_ZIPSFX) && (type >= CL_TYPE_SFX || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2) && type == CL_TYPE_MSEXE))) {
2004
-                                            /* FIXME: the first offset in the array is most likely the correct one but
2005
-                                             * it may happen it is not
2006
-                                             */
2007
-                                            for (j = 1; j <= CLI_DEFAULT_AC_TRACKLEN + 1 && offmatrix[0][j] != (uint32_t)-1; j++)
2008
-                                                if (ac_addtype(ftoffset, type, offmatrix[pt->parts - 1][j], ctx))
2009
-                                                    return CL_EMEM;
2002
+
2003
+                                        if (ftoffset != NULL) {
2004
+                                            // Caller provided a pointer to record matched types.
2005
+                                            bool too_many_types = false;
2006
+                                            bool supported_type = false;
2007
+
2008
+                                            if (*ftoffset != NULL) {
2009
+                                                // Have some type matches already. Check limits.
2010
+
2011
+                                                if (ctx && ((type == CL_TYPE_ZIPSFX) ||
2012
+                                                            (type == CL_TYPE_MSEXE && ftype == CL_TYPE_MSEXE))) {
2013
+                                                    // When ctx present, limit the number of type matches using ctx->engine->maxfiles for specific types.
2014
+                                                    // Reasoning:
2015
+                                                    //   ZIP local file header entries likely to be numerous if a single ZIP appended to the scanned file.
2016
+                                                    //   MSEXE can contain many embedded MSEXE entries and MSEXE type false positives matches.
2017
+
2018
+                                                    if (ctx->engine->maxfiles == 0) {
2019
+                                                        // Max-files limit is disabled.
2020
+                                                    } else if ((*ftoffset)->cnt >= ctx->engine->maxfiles) {
2021
+                                                        if (UNLIKELY(cli_get_debug_flag())) {
2022
+                                                            cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached maxfiles limit of %u\n", cli_ftname(type), (*ftoffset)->offset, ctx->engine->maxfiles);
2023
+                                                        }
2024
+                                                        too_many_types = true;
2025
+                                                    }
2026
+                                                } else {
2027
+                                                    // Limit the number of type matches using MAX_EMBEDDED_OBJ.
2028
+                                                    if ((*ftoffset)->cnt >= MAX_EMBEDDED_OBJ) {
2029
+                                                        if (UNLIKELY(cli_get_debug_flag())) {
2030
+                                                            cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached MAX_EMBEDDED_OBJ limit of %u\n", cli_ftname(type), (*ftoffset)->offset, MAX_EMBEDDED_OBJ);
2031
+                                                        }
2032
+                                                        too_many_types = true;
2033
+                                                    }
2034
+                                                }
2035
+                                            }
2036
+
2037
+                                            // Filter to supported types.
2038
+                                            if (
2039
+                                                // Found type is MBR.
2040
+                                                type == CL_TYPE_MBR ||
2041
+                                                // Found type is any SFX type (i.e., ZIPSFX, RARSFX, 7ZSSFX, etc.).
2042
+                                                type >= CL_TYPE_SFX ||
2043
+                                                // Found type is an MSEXE, but only if host file type is one of MSEXE, ZIP, or MSOLE2.
2044
+                                                (type == CL_TYPE_MSEXE && (ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2))) {
2045
+
2046
+                                                supported_type = true;
2047
+                                            }
2048
+
2049
+                                            if (supported_type && !too_many_types) {
2050
+                                                /* FIXME: the first offset in the array is most likely the correct one but
2051
+                                                 * it may happen it is not
2052
+                                                 * Until we're certain and can fix this, we add all offsets in the list.
2053
+                                                 */
2054
+                                                for (j = 1; j <= CLI_DEFAULT_AC_TRACKLEN + 1 && offmatrix[0][j] != (uint32_t)-1; j++) {
2055
+                                                    ret = ac_addtype(ftoffset, type, offmatrix[pt->parts - 1][j], ctx);
2056
+                                                    if (CL_SUCCESS != ret) {
2057
+                                                        return ret;
2058
+                                                    }
2059
+                                                }
2060
+                                            }
2010 2061
                                         }
2011 2062
 
2012 2063
                                         memset(offmatrix[0], (uint32_t)-1, pt->parts * (CLI_DEFAULT_AC_TRACKLEN + 2) * sizeof(uint32_t));
... ...
@@ -2066,11 +2131,59 @@ cl_error_t cli_ac_scanbuff(
2066 2066
 
2067 2067
                                     cli_dbgmsg("Matched signature for file type %s at %u\n", pt->virname, realoff);
2068 2068
                                     type = pt->type;
2069
-                                    if ((ftoffset != NULL) &&
2070
-                                        ((*ftoffset == NULL) || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ || type == CL_TYPE_ZIPSFX) && (type == CL_TYPE_MBR || type >= CL_TYPE_SFX || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2) && type == CL_TYPE_MSEXE))) {
2071 2069
 
2072
-                                        if (ac_addtype(ftoffset, type, realoff, ctx))
2073
-                                            return CL_EMEM;
2070
+                                    if (ftoffset != NULL) {
2071
+                                        // Caller provided a pointer to record matched types.
2072
+                                        bool too_many_types = false;
2073
+                                        bool supported_type = false;
2074
+
2075
+                                        if (*ftoffset != NULL) {
2076
+                                            // Have some type matches already. Check limits.
2077
+
2078
+                                            if (ctx && ((type == CL_TYPE_ZIPSFX) ||
2079
+                                                        (type == CL_TYPE_MSEXE && ftype == CL_TYPE_MSEXE))) {
2080
+                                                // When ctx present, limit the number of type matches using ctx->engine->maxfiles for specific types.
2081
+                                                // Reasoning:
2082
+                                                //   ZIP local file header entries likely to be numerous if a single ZIP appended to the scanned file.
2083
+                                                //   MSEXE can contain many embedded MSEXE entries and MSEXE type false positives matches.
2084
+
2085
+                                                if (ctx->engine->maxfiles == 0) {
2086
+                                                    // Max-files limit is disabled.
2087
+                                                } else if ((*ftoffset)->cnt >= ctx->engine->maxfiles) {
2088
+                                                    if (UNLIKELY(cli_get_debug_flag())) {
2089
+                                                        cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached maxfiles limit of %u\n", cli_ftname(type), (*ftoffset)->offset, ctx->engine->maxfiles);
2090
+                                                    }
2091
+                                                    too_many_types = true;
2092
+                                                }
2093
+                                            } else {
2094
+                                                // Limit the number of type matches using MAX_EMBEDDED_OBJ.
2095
+                                                if ((*ftoffset)->cnt >= MAX_EMBEDDED_OBJ) {
2096
+                                                    if (UNLIKELY(cli_get_debug_flag())) {
2097
+                                                        cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached MAX_EMBEDDED_OBJ limit of %u\n", cli_ftname(type), (*ftoffset)->offset, MAX_EMBEDDED_OBJ);
2098
+                                                    }
2099
+                                                    too_many_types = true;
2100
+                                                }
2101
+                                            }
2102
+                                        }
2103
+
2104
+                                        // Filter to supported types.
2105
+                                        if (
2106
+                                            // Found type is MBR.
2107
+                                            type == CL_TYPE_MBR ||
2108
+                                            // Found type is any SFX type (i.e., ZIPSFX, RARSFX, 7ZSSFX, etc.).
2109
+                                            type >= CL_TYPE_SFX ||
2110
+                                            // Found type is an MSEXE, but only if host file type is one of MSEXE, ZIP, or MSOLE2.
2111
+                                            (type == CL_TYPE_MSEXE && (ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2))) {
2112
+
2113
+                                            supported_type = true;
2114
+                                        }
2115
+
2116
+                                        if (supported_type && !too_many_types) {
2117
+                                            ret = ac_addtype(ftoffset, type, realoff, ctx);
2118
+                                            if (CL_SUCCESS != ret) {
2119
+                                                return ret;
2120
+                                            }
2121
+                                        }
2074 2122
                                     }
2075 2123
                                 }
2076 2124
                             } else {