Browse code

Fix integer overflow in PDF parser

The ascii85decode function computes the amount of memory to reserve as
(4 * bytes) + 1. Since the result is stored in a uint32_t, we need to
make sure that this calculation cannot overflow. If we detect that an
overflow would occur, we return CL_EFORMAT and do not proceed.

Also check additional potential overflow conditions: other areas of the
PDF parser were identified that could potentially overflow, and this
commit adds checks to prevent those overflows as well.

Thank you Greg Walkup at Sandia National Labs for reporting this issue.

CLAM-2752
CLAM-2757
CLAM-2759

John Humlick authored on 2025/04/22 08:18:07
Showing 3 changed files
... ...
@@ -441,7 +441,7 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,
441 441
 
442 442
         if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
443 443
             /* Failed to find obj offset for next obj */
444
-            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
444
+            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%zu} more.\n", objstm->n - objstm->nobjs_found);
445 445
             status = CL_EPARSE;
446 446
             goto done;
447 447
         } else if (temp_long < 0) {
... ...
@@ -1555,18 +1555,18 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
1555 1555
                 }
1556 1556
             }
1557 1557
 
1558
-            cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
1558
+            cli_dbgmsg("pdf_extract_obj: calculated length %zu\n", length);
1559 1559
         } else {
1560 1560
             if (obj->stream_size > (size_t)length + 2) {
1561 1561
                 cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
1562
-                           (size_t)length, obj->stream_size);
1562
+                           length, obj->stream_size);
1563 1563
                 length = obj->stream_size;
1564 1564
             }
1565 1565
         }
1566 1566
 
1567
-        if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) {
1568
-            cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
1569
-                       (long long)orig_length, (long long)length, obj->stream_size);
1567
+        if ((0 != orig_length) && (obj->stream_size > orig_length + 20)) {
1568
+            cli_dbgmsg("pdf_extract_obj: orig length: %zu, length: %zu, size: %zu\n",
1569
+                       orig_length, length, obj->stream_size);
1570 1570
             pdfobj_flag(pdf, obj, BAD_STREAMLEN);
1571 1571
         }
1572 1572
 
... ...
@@ -1620,18 +1620,18 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
1620 1620
          */
1621 1621
         dict_len = obj->stream - start;
1622 1622
         if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
1623
-            int32_t objstm_first  = -1;
1624
-            int32_t objstm_length = -1;
1625
-            int32_t objstm_n      = -1;
1623
+            int objstm_first  = -1;
1624
+            int objstm_length = -1;
1625
+            int objstm_n      = -1;
1626 1626
 
1627 1627
             cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
1628 1628
 
1629 1629
             dict_len = obj->stream - start;
1630
-            if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First")))) {
1630
+            if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) {
1631 1631
                 cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
1632
-            } else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length")))) {
1632
+            } else if (-1 == (objstm_length = pdf_readint(start, dict_len, "/Length"))) {
1633 1633
                 cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
1634
-            } else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N")))) {
1634
+            } else if (-1 == (objstm_n = pdf_readint(start, dict_len, "/N"))) {
1635 1635
                 cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
1636 1636
             } else {
1637 1637
                 /* Add objstm to pdf struct, so it can be freed eventually */
... ...
@@ -1653,19 +1653,19 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
1653 1653
 
1654 1654
                 memset(objstm, 0, sizeof(*objstm));
1655 1655
 
1656
-                objstm->first        = (uint32_t)objstm_first;
1657
-                objstm->current      = (uint32_t)objstm_first;
1656
+                objstm->first        = (size_t)objstm_first;
1657
+                objstm->current      = (size_t)objstm_first;
1658 1658
                 objstm->current_pair = 0;
1659
-                objstm->length       = (uint32_t)objstm_length;
1660
-                objstm->n            = (uint32_t)objstm_n;
1659
+                objstm->length       = (size_t)objstm_length;
1660
+                objstm->n            = (size_t)objstm_n;
1661 1661
 
1662
-                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
1663
-                cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
1664
-                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
1662
+                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %zu\n", objstm->first);
1663
+                cli_dbgmsg("pdf_extract_obj: ObjStm length is %zu bytes\n", objstm->length);
1664
+                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %zu objects\n", objstm->n);
1665 1665
             }
1666 1666
         }
1667 1667
 
1668
-        sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm);
1668
+        sum = pdf_decodestream(pdf, obj, dparams, obj->stream, length, xref, fout, &rc, objstm);
1669 1669
         if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) {
1670 1670
             cli_dbgmsg("Error decoding stream! Error code: %d\n", rc);
1671 1671
 
... ...
@@ -3351,7 +3351,7 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs
3351 3351
         retval = pdf_findobj_in_objstm(pdf, objstm, &obj);
3352 3352
         if (retval != CL_SUCCESS) {
3353 3353
             if (retval != CL_BREAK) {
3354
-                cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %u found, %u expected.\n",
3354
+                cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %zu found, %zu expected.\n",
3355 3355
                            objstm->nobjs_found, objstm->n);
3356 3356
                 badobjects++;
3357 3357
                 pdf->stats.ninvalidobjs++;
... ...
@@ -27,14 +27,14 @@
27 27
 #define PDF_OBJECT_RECURSION_LIMIT 25
28 28
 
29 29
 struct objstm_struct {
30
-    uint32_t first;        // offset of first obj
31
-    uint32_t current;      // offset of current obj
32
-    uint32_t current_pair; // offset of current pair describing id, location of object
33
-    uint32_t length;       // total length of all objects (starting at first)
34
-    uint32_t n;            // number of objects that should be found in the object stream
35
-    uint32_t nobjs_found;  // number of objects actually found in the object stream
36
-    char *streambuf;       // address of stream buffer, beginning with first obj pair
37
-    size_t streambuf_len;  // length of stream buffer, includes pairs followed by actual objects
30
+    size_t first;         // offset of first obj
31
+    size_t current;       // offset of current obj
32
+    size_t current_pair;  // offset of current pair describing id, location of object
33
+    size_t length;        // total length of all objects (starting at first)
34
+    size_t n;             // number of objects that should be found in the object stream
35
+    size_t nobjs_found;   // number of objects actually found in the object stream
36
+    char *streambuf;      // address of stream buffer, beginning with first obj pair
37
+    size_t streambuf_len; // length of stream buffer, includes pairs followed by actual objects
38 38
 };
39 39
 
40 40
 struct pdf_obj {
... ...
@@ -73,7 +73,7 @@
73 73
 struct pdf_token {
74 74
     uint32_t flags;   /* tracking flags */
75 75
     uint32_t success; /* successfully decoded filters */
76
-    uint32_t length;  /* length of current content; TODO: transition to size_t */
76
+    size_t length;    /* length of current content; TODO: transition to size_t */
77 77
     uint8_t *content; /* content stream */
78 78
 };
79 79
 
... ...
@@ -448,10 +448,16 @@ static cl_error_t filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *o
448 448
     uint32_t declen = 0;
449 449
 
450 450
     const uint8_t *ptr = (uint8_t *)token->content;
451
-    uint32_t remaining = token->length;
451
+    size_t remaining   = token->length;
452 452
     int quintet = 0, rc = CL_SUCCESS;
453 453
     uint64_t sum = 0;
454 454
 
455
+    /* Check for overflow */
456
+    if (remaining > (SIZE_MAX / 4)) {
457
+        cli_dbgmsg("cli_pdf: ascii85decode: overflow detected\n");
458
+        return CL_EFORMAT;
459
+    }
460
+
455 461
     /* 5:4 decoding ratio, with 1:4 expansion sequences => (4*length)+1 */
456 462
     if (!(dptr = decoded = (uint8_t *)cli_malloc((4 * remaining) + 1))) {
457 463
         cli_errmsg("cli_pdf: cannot allocate memory for decoded output\n");
... ...
@@ -838,8 +844,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
838 838
     uint8_t *decoded;
839 839
 
840 840
     const uint8_t *content = (uint8_t *)token->content;
841
-    uint32_t length        = token->length;
842
-    uint32_t i, j;
841
+    size_t length          = token->length;
842
+    size_t i, j;
843 843
     cl_error_t rc = CL_SUCCESS;
844 844
 
845 845
     if (!(decoded = (uint8_t *)cli_calloc(length / 2 + 1, sizeof(uint8_t)))) {
... ...
@@ -869,8 +875,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
869 869
     if (rc == CL_SUCCESS) {
870 870
         free(token->content);
871 871
 
872
-        cli_dbgmsg("cli_pdf: deflated %lu bytes from %lu total bytes\n",
873
-                   (unsigned long)j, (unsigned long)(token->length));
872
+        cli_dbgmsg("cli_pdf: deflated %zu bytes from %zu total bytes\n",
873
+                   j, token->length);
874 874
 
875 875
         token->content = decoded;
876 876
         token->length  = j;
... ...
@@ -878,8 +884,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
878 878
         if (!(obj->flags & ((1 << OBJ_IMAGE) | (1 << OBJ_TRUNCATED))))
879 879
             pdfobj_flag(pdf, obj, BAD_ASCIIDECODE);
880 880
 
881
-        cli_dbgmsg("cli_pdf: error occurred parsing byte %lu of %lu\n",
882
-                   (unsigned long)i, (unsigned long)(token->length));
881
+        cli_dbgmsg("cli_pdf: error occurred parsing byte %zu of %zu\n",
882
+                   i, token->length);
883 883
         free(decoded);
884 884
     }
885 885
     return rc;
... ...
@@ -920,27 +926,29 @@ static cl_error_t filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, st
920 920
         return CL_EPARSE; /* TODO: what should this value be? CL_SUCCESS would mirror previous behavior */
921 921
     }
922 922
 
923
-    cli_dbgmsg("cli_pdf: decrypted %zu bytes from %u total bytes\n",
923
+    cli_dbgmsg("cli_pdf: decrypted %zu bytes from %zu total bytes\n",
924 924
                length, token->length);
925 925
 
926 926
     free(token->content);
927 927
     token->content = (uint8_t *)decrypted;
928
-    token->length  = (uint32_t)length; /* this may truncate unfortunately, TODO: use 64-bit values internally? */
928
+    token->length  = length;
929 929
     return CL_SUCCESS;
930 930
 }
931 931
 
932 932
 static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
933 933
 {
934 934
     uint8_t *decoded, *temp;
935
-    uint32_t declen = 0, capacity = 0;
935
+    size_t declen = 0, capacity = 0;
936 936
 
937 937
     uint8_t *content = (uint8_t *)token->content;
938 938
     uint32_t length  = token->length;
939 939
     lzw_stream stream;
940 940
     int echg = 1, lzwstat, rc = CL_SUCCESS;
941 941
 
942
-    if (pdf->ctx && !(pdf->ctx->dconf->other & OTHER_CONF_LZW))
943
-        return CL_BREAK;
942
+    if (pdf->ctx && !(pdf->ctx->dconf->other & OTHER_CONF_LZW)) {
943
+        rc = CL_BREAK;
944
+        goto done;
945
+    }
944 946
 
945 947
     if (params) {
946 948
         struct pdf_dict_node *node = params->nodes;
... ...
@@ -971,15 +979,18 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
971 971
          * Sample 0015315109, it has \r followed by zlib header.
972 972
          * Flag pdf as suspicious, and attempt to extract by skipping the \r.
973 973
          */
974
-        if (!length)
975
-            return CL_SUCCESS;
974
+        if (!length) {
975
+            rc = CL_SUCCESS;
976
+            goto done;
977
+        }
976 978
     }
977 979
 
978 980
     capacity = INFLATE_CHUNK_SIZE;
979 981
 
980 982
     if (!(decoded = (uint8_t *)cli_malloc(capacity))) {
981 983
         cli_errmsg("cli_pdf: cannot allocate memory for decoded output\n");
982
-        return CL_EMEM;
984
+        rc = CL_EMEM;
985
+        goto done;
983 986
     }
984 987
 
985 988
     memset(&stream, 0, sizeof(stream));
... ...
@@ -994,7 +1005,8 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
994 994
     if (lzwstat != Z_OK) {
995 995
         cli_warnmsg("cli_pdf: lzwInit failed\n");
996 996
         free(decoded);
997
-        return CL_EMEM;
997
+        rc = CL_EMEM;
998
+        goto done;
998 999
     }
999 1000
 
1000 1001
     /* initial inflate */
... ...
@@ -1009,16 +1021,23 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
1009 1009
             length -= q - content;
1010 1010
             content = q;
1011 1011
 
1012
-            stream.next_in   = (Bytef *)content;
1013
-            stream.avail_in  = length;
1014
-            stream.next_out  = (Bytef *)decoded;
1012
+            stream.next_in  = (Bytef *)content;
1013
+            stream.avail_in = length;
1014
+            stream.next_out = (Bytef *)decoded;
1015
+            /* Make sure we don't overflow during type conversion */
1016
+            if (capacity > UINT_MAX) {
1017
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
1018
+                rc = CL_EFORMAT;
1019
+                goto done;
1020
+            }
1015 1021
             stream.avail_out = capacity;
1016 1022
 
1017 1023
             lzwstat = lzwInit(&stream);
1018 1024
             if (lzwstat != Z_OK) {
1019 1025
                 cli_warnmsg("cli_pdf: lzwInit failed\n");
1020 1026
                 free(decoded);
1021
-                return CL_EMEM;
1027
+                rc = CL_EMEM;
1028
+                goto done;
1022 1029
             }
1023 1030
 
1024 1031
             pdfobj_flag(pdf, obj, BAD_FLATESTART);
... ...
@@ -1031,7 +1050,7 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
1031 1031
         /* extend output capacity if needed,*/
1032 1032
         if (stream.avail_out == 0) {
1033 1033
             if ((rc = cli_checklimits("pdf", pdf->ctx, capacity + INFLATE_CHUNK_SIZE, 0, 0)) != CL_SUCCESS) {
1034
-                cli_dbgmsg("cli_pdf: required buffer size to inflate compressed filter exceeds maximum: %u\n", capacity + INFLATE_CHUNK_SIZE);
1034
+                cli_dbgmsg("cli_pdf: required buffer size to inflate compressed filter exceeds maximum: %zu\n", capacity + INFLATE_CHUNK_SIZE);
1035 1035
                 break;
1036 1036
             }
1037 1037
 
... ...
@@ -1043,7 +1062,17 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
1043 1043
             decoded          = temp;
1044 1044
             stream.next_out  = decoded + capacity;
1045 1045
             stream.avail_out = INFLATE_CHUNK_SIZE;
1046
+            if (declen > (SIZE_MAX - INFLATE_CHUNK_SIZE)) {
1047
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
1048
+                rc = CL_EFORMAT;
1049
+                goto done;
1050
+            }
1046 1051
             declen += INFLATE_CHUNK_SIZE;
1052
+            if (capacity > (SIZE_MAX - INFLATE_CHUNK_SIZE)) {
1053
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
1054
+                rc = CL_EFORMAT;
1055
+                goto done;
1056
+            }
1047 1057
             capacity += INFLATE_CHUNK_SIZE;
1048 1058
         }
1049 1059
 
... ...
@@ -1051,6 +1080,12 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
1051 1051
         lzwstat = lzwInflate(&stream);
1052 1052
     }
1053 1053
 
1054
+    if (declen > (UINT32_MAX - (INFLATE_CHUNK_SIZE - stream.avail_out))) {
1055
+        cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
1056
+        rc = CL_EFORMAT;
1057
+        goto done;
1058
+    }
1059
+
1054 1060
     /* add stream end fragment to decoded length */
1055 1061
     declen += (INFLATE_CHUNK_SIZE - stream.avail_out);
1056 1062
 
... ...
@@ -1091,6 +1126,7 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
1091 1091
 
1092 1092
     (void)lzwInflateEnd(&stream);
1093 1093
 
1094
+done:
1094 1095
     if (rc == CL_SUCCESS) {
1095 1096
         if (declen == 0) {
1096 1097
             cli_dbgmsg("cli_pdf: empty stream after inflation completed.\n");