Browse code

pdfdecode: add dictionary heuristic and all-match support

Kevin Lin authored on 2016/04/15 05:42:51
Showing 4 changed files
... ...
@@ -67,7 +67,8 @@
67 67
  * strings SHOULD range from 9 to 12 bits.
68 68
  */
69 69
 #define BITS_MIN    9       /* start with 9 bits */
70
-#define BITS_MAX    13      /* max of 12 bit strings, +1 for robustness */
70
+#define BITS_VALID  12      /* widest valid code is 12 bits; wider codes set LZW_FLAG_BIGDICT */
71
+#define BITS_MAX    14      /* max of 12 bit strings, +2 for robustness */
71 72
 /* predefined codes */
72 73
 #define CODE_BASIC  256     /* last basic code + 1 */
73 74
 #define CODE_CLEAR  256     /* code to clear string table */
... ...
@@ -94,7 +95,6 @@ struct lzw_internal_state {
94 94
     uint16_t    nbits;      /* # of bits/code */
95 95
     long        nextdata;   /* next bits of i/o */
96 96
     long        nextbits;   /* # of valid bits in lzw_nextdata */
97
-    uint32_t    flags;      /* flags affecting decompression */
98 97
 
99 98
     /* decoding-specific state */
100 99
     long    dec_nbitsmask;  /* lzw_nbits 1 bits, right adjusted */
... ...
@@ -146,7 +146,7 @@ break;                                                    \
146 146
     oldcodep = state->dec_codetab + code;                               \
147 147
 }
148 148
 
149
-int lzwInit(lzw_streamp strm, uint32_t flags)
149
+int lzwInit(lzw_streamp strm)
150 150
 {
151 151
     struct lzw_internal_state *sp;
152 152
     hcode_t code;
... ...
@@ -161,7 +161,6 @@ int lzwInit(lzw_streamp strm, uint32_t flags)
161 161
     sp->nbits = BITS_MIN;
162 162
     sp->nextdata = 0;
163 163
     sp->nextbits = 0;
164
-    sp->flags = flags;
165 164
 
166 165
     /* dictionary setup */
167 166
     sp->dec_codetab = cli_calloc(CSIZE, sizeof(code_t));
... ...
@@ -200,19 +199,22 @@ int lzwInflate(lzw_streamp strm)
200 200
     uint8_t *wp;
201 201
     hcode_t code, free_code;
202 202
     int echg, ret = LZW_OK;
203
+    uint32_t flags;
203 204
 
204 205
     if (strm == NULL || strm->state == NULL || strm->next_out == NULL ||
205 206
         (strm->next_in == NULL && strm->avail_in != 0))
206 207
         return LZW_STREAM_ERROR;
207 208
 
208 209
     /* load state */
209
-    state = strm->state;
210 210
     to = strm->next_out;
211 211
     out = left = strm->avail_out;
212 212
 
213 213
     from = strm->next_in;
214 214
     in = have = strm->avail_in;
215 215
 
216
+    flags = strm->flags;
217
+    state = strm->state;
218
+
216 219
     nbits = state->nbits;
217 220
     nextdata = state->nextdata;
218 221
     nextbits = state->nextbits;
... ...
@@ -221,7 +223,7 @@ int lzwInflate(lzw_streamp strm)
221 221
     free_entp = state->dec_free_entp;
222 222
     maxcodep = state->dec_maxcodep;
223 223
 
224
-    echg = state->flags & LZW_FLAG_EARLYCHG;
224
+    echg = flags & LZW_FLAG_EARLYCHG;
225 225
     free_code = free_entp - &state->dec_codetab[0];
226 226
 
227 227
     if (oldcodep == &state->dec_codetab[CODE_EOI])
... ...
@@ -289,8 +291,11 @@ int lzwInflate(lzw_streamp strm)
289 289
 
290 290
         /* non-earlychange bit expansion */
291 291
         if (!echg && free_entp > maxcodep) {
292
-            if (++nbits > BITS_MAX)     /* should not happen */
293
-                nbits = BITS_MAX;
292
+            if (++nbits > BITS_VALID) {
293
+                flags |= LZW_FLAG_BIGDICT;
294
+                if (nbits > BITS_MAX)     /* should not happen */
295
+                    nbits = BITS_MAX;
296
+            }
294 297
             nbitsmask = MAXCODE(nbits);
295 298
             maxcodep = state->dec_codetab + nbitsmask-1;
296 299
         }
... ...
@@ -311,8 +316,11 @@ int lzwInflate(lzw_streamp strm)
311 311
         free_entp++;
312 312
         /* earlychange bit expansion */
313 313
         if (echg && free_entp > maxcodep) {
314
-            if (++nbits > BITS_MAX)     /* should not happen */
315
-                nbits = BITS_MAX;
314
+            if (++nbits > BITS_VALID) {
315
+                flags |= LZW_FLAG_BIGDICT;
316
+                if (nbits > BITS_MAX)     /* should not happen */
317
+                    nbits = BITS_MAX;
318
+            }
316 319
             nbitsmask = MAXCODE(nbits);
317 320
             maxcodep = state->dec_codetab + nbitsmask-1;
318 321
         }
... ...
@@ -366,6 +374,7 @@ inf_end:
366 366
     strm->avail_out = left;
367 367
     strm->next_in = from;
368 368
     strm->avail_in = have;
369
+    strm->flags = flags;
369 370
 
370 371
     state->nbits = (uint16_t)nbits;
371 372
     state->nextdata = nextdata;
... ...
@@ -49,6 +49,8 @@ typedef struct lzw_stream_s {
49 49
     unsigned total_out;
50 50
 
51 51
     char *msg;
52
+
53
+    uint32_t flags;
52 54
     struct lzw_internal_state *state;
53 55
 } lzw_stream;
54 56
 
... ...
@@ -62,10 +64,13 @@ typedef lzw_stream *lzw_streamp;
62 62
 #define LZW_BUF_ERROR    (-5)
63 63
 #define LZW_DICT_ERROR   (-7)
64 64
 
65
-#define LZW_NOFLAGS        0
66
-#define LZW_FLAG_EARLYCHG  1
65
+/* option flags */
66
+#define LZW_NOFLAGS        0x0
67
+#define LZW_FLAG_EARLYCHG  0x1
68
+/* state flags */
69
+#define LZW_FLAG_BIGDICT   0x100
67 70
 
68
-int lzwInit(lzw_streamp strm, uint32_t flags);
71
+int lzwInit(lzw_streamp strm);
69 72
 int lzwInflate(lzw_streamp strm);
70 73
 int lzwInflateEnd(lzw_streamp strm);
71 74
 
... ...
@@ -957,8 +957,10 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
957 957
                 if (dparams)
958 958
                     pdf_free_dict(dparams);
959 959
 
960
-                if (sum < 0)
961
-                    return rc;
960
+                if (sum < 0 || (rc == CL_VIRUS && !(pdf->ctx->options & CL_SCAN_ALLMATCHES))) {
961
+                    sum = 0; /* prevents post-filter scan */
962
+                    break;
963
+                }
962 964
 
963 965
                 cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
964 966
 
... ...
@@ -117,7 +117,13 @@ off_t pdf_decodestream(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_d
117 117
     cli_dbgmsg("cli_pdf: detected %lu applied filters\n", (long unsigned)(obj->numfilters));
118 118
 
119 119
     rv = pdf_decodestream_internal(pdf, obj, params, token);
120
-    /* return is ignored so that the existing content is dumped to file */
120
+    /* only CL_VIRUS is propagated through *rc; other return codes are ignored so that the existing content is still dumped to file */
121
+    if (rc) {
122
+        if (rv == CL_VIRUS)
123
+            *rc = CL_VIRUS;
124
+        else
125
+            *rc = CL_SUCCESS;
126
+    }
121 127
 
122 128
     if (!cli_checklimits("pdf", pdf->ctx, token->length, 0, 0)) {
123 129
         if (cli_writen(fout, token->content, token->length) != token->length) {
... ...
@@ -131,15 +137,13 @@ off_t pdf_decodestream(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_d
131 131
 
132 132
     free(token->content);
133 133
     free(token);
134
-    if (rc)
135
-        *rc = CL_SUCCESS;
136 134
     return rv;
137 135
 }
138 136
 
139 137
 static int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
140 138
 {
141 139
     const char *filter = NULL;
142
-    int i, rc = CL_SUCCESS;
140
+    int i, vir = 0, rc = CL_SUCCESS;
143 141
 
144 142
     /*
145 143
      * if pdf is decryptable, scan for CRYPT filter
... ...
@@ -211,22 +215,26 @@ static int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj
211 211
         }
212 212
 
213 213
         if (rc != CL_SUCCESS) {
214
-            const char *reason;
215
-            switch (rc) {
216
-            case CL_VIRUS:
217
-                reason = "detection";
218
-                break;
219
-            case CL_BREAK:
220
-                reason = "break decoding";
221
-                break;
222
-            default:
223
-                reason = "error decoding";
214
+            if (rc == CL_VIRUS && pdf->ctx->options & CL_SCAN_ALLMATCHES)
215
+                vir = 1;
216
+            else {
217
+                const char *reason;
218
+                switch (rc) {
219
+                case CL_VIRUS:
220
+                    reason = "detection";
221
+                    break;
222
+                case CL_BREAK:
223
+                    reason = "break decoding";
224
+                    break;
225
+                default:
226
+                    reason = "error decoding";
227
+                    break;
228
+                }
229
+
230
+                cli_dbgmsg("cli_pdf: %s, stopping after %d (of %lu) filters\n",
231
+                           reason, i, (long unsigned)(obj->numfilters));
224 232
                 break;
225 233
             }
226
-
227
-            cli_dbgmsg("cli_pdf: %s, stopping after %d (of %lu) filters\n",
228
-                       reason, i, (long unsigned)(obj->numfilters));
229
-            break;
230 234
         }
231 235
 
232 236
         if (cl_engine_get_num(pdf->ctx->engine, CL_ENGINE_FORCETODISK, NULL) &&
... ...
@@ -237,6 +245,8 @@ static int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj
237 237
         }
238 238
     }
239 239
 
240
+    if (vir)
241
+        return CL_VIRUS;
240 242
     if (rc == CL_BREAK)
241 243
         return CL_SUCCESS;
242 244
     return rc;
... ...
@@ -786,8 +796,10 @@ static int filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct
786 786
     stream.avail_in = length;
787 787
     stream.next_out = decoded;
788 788
     stream.avail_out = BUFSIZ;
789
+    if (echg)
790
+        stream.flags |= LZW_FLAG_EARLYCHG;
789 791
 
790
-    lzwstat = lzwInit(&stream, echg ? LZW_FLAG_EARLYCHG : LZW_NOFLAGS);
792
+    lzwstat = lzwInit(&stream);
791 793
     if(lzwstat != Z_OK) {
792 794
         cli_warnmsg("cli_pdf: lzwInit failed\n");
793 795
         free(decoded);
... ...
@@ -811,7 +823,7 @@ static int filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct
811 811
             stream.next_out = (Bytef *)decoded;
812 812
             stream.avail_out = capacity;
813 813
 
814
-            lzwstat = lzwInit(&stream, echg ? LZW_FLAG_EARLYCHG : LZW_NOFLAGS);
814
+            lzwstat = lzwInit(&stream);
815 815
             if(lzwstat != Z_OK) {
816 816
                 cli_warnmsg("cli_pdf: lzwInit failed\n");
817 817
                 free(decoded);
... ...
@@ -897,5 +909,11 @@ static int filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct
897 897
         free(decoded);
898 898
     }
899 899
 
900
+    /* heuristic check */
901
+    if (stream.flags & LZW_FLAG_BIGDICT) {
902
+        cli_append_virus(pdf->ctx, "Heuristics.PDF.LZWInvalidDictionary");
903
+        rc = CL_VIRUS;
904
+    }
905
+
900 906
     return rc;
901 907
 }