Browse code

PDF names.

Török Edvin authored on 2010/05/11 05:41:34
Showing 1 changed file
... ...
@@ -64,7 +64,10 @@ enum pdf_flag {
64 64
     BAD_PDF_VERSION=0,
65 65
     BAD_PDF_HEADERPOS,
66 66
     BAD_PDF_TRAILER,
67
-    BAD_PDF_TOOMANYOBJS
67
+    BAD_PDF_TOOMANYOBJS,
68
+    BAD_STREAM_FILTERS,
69
+    UNTERMINATED_OBJ_DICT,
70
+    ESCAPED_COMMON_PDFNAME,
68 71
 };
69 72
 
70 73
 static int xrefCheck(const char *xref, const char *eof)
... ...
@@ -89,7 +92,19 @@ static int xrefCheck(const char *xref, const char *eof)
89 89
 }
90 90
 
91 91
 enum objflags { /* bit positions (not masks) in pdf_obj.flags; used as (1 << OBJ_x) */
92
-    OBJ_STREAM=0
92
+    OBJ_STREAM=0, /* set by pdf_findobj when the "stream" keyword is found */
93
+    OBJ_DICT, /* set by pdf_parseobj once a dictionary end is located; also the "no flag" value in pdfname_actions */
94
+    OBJ_FILTER_AH, /* /ASCIIHexDecode seen while in filter state */
95
+    OBJ_FILTER_A85, /* /ASCII85Decode */
96
+    OBJ_FILTER_FLATE, /* /FlateDecode */
97
+    OBJ_FILTER_LZW, /* /LZWDecode */
98
+    OBJ_FILTER_RL, /* /RunLengthDecode */
99
+    OBJ_FILTER_FAX, /* /CCITTFaxDecode */
100
+    OBJ_FILTER_JBIG2, /* presumably intended for /JBIG2Decode - TODO confirm against pdfname_actions */
101
+    OBJ_FILTER_DCT, /* /DCTDecode */
102
+    OBJ_FILTER_JPX, /* /JPXDecode */
103
+    OBJ_FILTER_CRYPT, /* /Crypt */
104
+    OBJ_JAVASCRIPT /* /JavaScript seen directly after /S */
93 105
 };
94 106
 
95 107
 struct pdf_obj {
... ...
@@ -103,6 +118,7 @@ struct pdf_struct {
103 103
     const char *map;
104 104
     off_t size;
105 105
     off_t offset;
106
+    unsigned flags;
106 107
 };
107 108
 
108 109
 static const char *findNextNonWSBack(const char *q, const char *start)
... ...
@@ -115,7 +131,7 @@ static const char *findNextNonWSBack(const char *q, const char *start)
115 115
     return q;
116 116
 }
117 117
 
118
-static int pdf_parseobj(struct pdf_struct *pdf)
118
+static int pdf_findobj(struct pdf_struct *pdf)
119 119
 {
120 120
     const char *start, *q, *q2, *eof;
121 121
     struct pdf_obj *obj;
... ...
@@ -152,7 +168,7 @@ static int pdf_parseobj(struct pdf_struct *pdf)
152 152
 	if (!q2)
153 153
 	    return 0;/* no more objs */
154 154
 	bytesleft -= q2 - q;
155
-	if (!memcmp(q2, "stream", 6)) {
155
+	if ((q2 = cli_memstr(q-1, q2-q+1, "stream", 6))) {
156 156
 	    obj->flags |= 1 << OBJ_STREAM;
157 157
 	    q2 += 6;
158 158
 	    bytesleft -= 6;
... ...
@@ -161,22 +177,193 @@ static int pdf_parseobj(struct pdf_struct *pdf)
161 161
 		return 0;/* no more objs */
162 162
 	    q2 += 6;
163 163
 	    bytesleft -= q2 - q;
164
-	} else if (!memcmp(q2,"endobj",6)) {
164
+	} else if ((q2 = cli_memstr(q-1, q2-q+1, "endobj", 6))) {
165 165
 	    q2 += 6;
166 166
 	    pdf->offset = q2 - pdf->map;
167 167
 	    return 1; /* obj found and offset positioned */
168 168
 	} else {
169
-	    q2 = q+1;
169
+	    q2++;
170 170
 	}
171 171
 	q = q2;
172 172
     }
173 173
     return 0;/* no more objs */
174 174
 }
175 175
 
176
+static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag)
177
+{
178
+    const char *s;
179
+    pdf->flags |= 1 << flag;
180
+    if (!cli_debug_flag)
181
+	return;
182
+    switch (flag) {
183
+	case UNTERMINATED_OBJ_DICT:
184
+	    s = "dictionary not terminated";
185
+	    break;
186
+	case ESCAPED_COMMON_PDFNAME:
187
+	    /* like /JavaScript */
188
+	    s = "escaped common pdfname";
189
+	    break;
190
+	case BAD_STREAM_FILTERS:
191
+	    s = "duplicate stream filters";
192
+	    break;
193
+	case BAD_PDF_VERSION:
194
+	case BAD_PDF_HEADERPOS:
195
+	case BAD_PDF_TRAILER:
196
+	case BAD_PDF_TOOMANYOBJS:
197
+	    return;
198
+    }
199
+    cli_dbgmsg("cli_pdf: %s in object %u %u\n", s, obj->id>>8, obj->id&0xff);
200
+}
201
+
202
+enum objstate {
203
+    STATE_NONE,
204
+    STATE_S,
205
+    STATE_FILTER,
206
+    STATE_ANY /* for actions table below */
207
+};
208
+
209
+struct pdfname_action {
210
+    const char *pdfname;
211
+    enum objflags set_objflag;/* OBJ_DICT is noop */
212
+    enum objstate from_state;/* STATE_NONE is noop */
213
+    enum objstate to_state;
214
+};
215
+
216
+static struct pdfname_action pdfname_actions[] = {
217
+    {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
218
+    {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
219
+    {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
220
+    {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
221
+    {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER},
222
+    {"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER},
223
+    {"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
224
+    {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
225
+    {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER},
226
+    {"Crypt",  OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE},
227
+    {"Filter", OBJ_DICT, STATE_ANY, STATE_FILTER},
228
+    {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_NONE},
229
+    {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE},
230
+    {"S", OBJ_DICT, STATE_NONE, STATE_S},
231
+    {"Type", OBJ_DICT, STATE_NONE, STATE_NONE}
232
+};
233
+
234
+static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj,
235
+			   const char *pdfname, int escapes,
236
+			   const char *after, enum objstate *state)
237
+{
238
+    struct pdfname_action *act = NULL;
239
+    unsigned j;
240
+    for (j=0;j<sizeof(pdfname_actions)/sizeof(pdfname_actions[0]);j++) {
241
+	if (!strncmp(pdfname, pdfname_actions[j].pdfname, strlen(pdfname_actions[j].pdfname))) {
242
+	    act = &pdfname_actions[j];
243
+	    break;
244
+	}
245
+    }
246
+    if (!act)
247
+	return;
248
+    if (escapes) {
249
+	/* if a commonly used PDF name is escaped that is certainly
250
+	   suspicious. */
251
+	cli_dbgmsg("cli_pdf: pdfname %s is escaped\n", pdfname);
252
+	pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME);
253
+    }
254
+    if (act->from_state == *state ||
255
+	act->from_state == STATE_ANY) {
256
+	*state = act->to_state;
257
+
258
+	if (*state == STATE_FILTER &&
259
+	    act->set_objflag !=OBJ_DICT &&
260
+	    (obj->flags & (1 << act->set_objflag))) {
261
+	    pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS);
262
+	}
263
+	obj->flags |= 1 << act->set_objflag;
264
+    } else {
265
+	//auto-reset states
266
+	switch (*state) {
267
+	    case STATE_S:
268
+		*state = STATE_NONE;
269
+		break;
270
+	}
271
+    }
272
+}
273
+
274
+static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
275
+{
276
+    /* enough to hold common pdf names, we don't need all the names */
277
+    char pdfname[64];
278
+    const char *q2, *q3;
279
+    const char *q = obj->start + pdf->map;
280
+    const char *dict, *start;
281
+    off_t dict_length;
282
+    off_t bytesleft = pdf->offset - obj->start;
283
+    unsigned i;
284
+    enum objstate objstate = STATE_NONE;
285
+
286
+    if (bytesleft < 0)
287
+	return;
288
+    start = q;
289
+    /* find start of dictionary */
290
+    do {
291
+	q2 = pdf_nextobject(q, bytesleft);
292
+	bytesleft -= q2 -q;
293
+	if (!q2 || bytesleft < 0) {
294
+	    return;
295
+	}
296
+	q3 = memchr(q-1, '<', q2-q+1);
297
+	q2++;
298
+	bytesleft--;
299
+	q = q2;
300
+    } while (!q3 || q3[1] != '<');
301
+    dict = q3+2;
302
+    q = dict;
303
+    bytesleft = pdf->offset - obj->start - (q3 - start);
304
+    /* find end of dictionary */
305
+    do {
306
+	q2 = pdf_nextobject(q, bytesleft);
307
+	bytesleft -= q2 -q;
308
+	if (!q2 || bytesleft < 0) {
309
+	    return;
310
+	}
311
+	q3 = memchr(q-1, '>', q2-q+1);
312
+	q2++;
313
+	bytesleft--;
314
+	q = q2;
315
+    } while (!q3 || q3[1] != '>');
316
+    obj->flags |= 1 << OBJ_DICT;
317
+    dict_length = q3 - dict;
318
+
319
+    // process pdf names
320
+    for (q = dict;dict_length;) {
321
+	int escapes = 0;
322
+	q2 = memchr(q, '/', dict_length);
323
+	if (!q2)
324
+	    break;
325
+	dict_length -= q2 - q;
326
+	// normalize PDF names
327
+	for (i = 0;dict_length && (i < sizeof(pdfname)-1); i++) {
328
+	    q++;
329
+	    dict_length--;
330
+	    if (*q == '#') {
331
+		cli_hex2str_to(q+1, pdfname+i, 2);
332
+		q += 2;
333
+		dict_length -= 2;
334
+		escapes = 1;
335
+		continue;
336
+	    }
337
+	    if (*q == ' ' || *q == '\r' || *q == '\n')
338
+		break;
339
+	    pdfname[i] = *q;
340
+	}
341
+	pdfname[i] = '\0';
342
+
343
+	handle_pdfname(pdf, obj, pdfname, escapes, q, &objstate);
344
+    }
345
+    cli_dbgmsg("cli_pdf: %u %u obj flags: %02x\n", obj->id>>8, obj->id&0xff, obj->flags);
346
+}
347
+
176 348
 int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
177 349
 {
178 350
     struct pdf_struct pdf;
179
-    unsigned flags = 0;
180 351
     fmap_t *map = *ctx->fmap;
181 352
     size_t size = map->len - offset;
182 353
     off_t versize = size > 1032 ? 1032 : size;
... ...
@@ -205,11 +392,11 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
205 205
      * versions */
206 206
     if (pdfver[5] != '1' || pdfver[6] != '.' ||
207 207
 	pdfver[7] < '1' || pdfver[7] > '9') {
208
-	flags |= 1 << BAD_PDF_VERSION;
208
+	pdf.flags |= 1 << BAD_PDF_VERSION;
209 209
 	cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver);
210 210
     }
211 211
     if (pdfver != start || offset) {
212
-	flags |= 1 << BAD_PDF_HEADERPOS;
212
+	pdf.flags |= 1 << BAD_PDF_HEADERPOS;
213 213
 	cli_dbgmsg("cli_pdf: PDF header is not at position 0: %d\n",pdfver-start+offset);
214 214
     }
215 215
     offset += pdfver - start;
... ...
@@ -230,7 +417,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
230 230
 	    break;
231 231
     }
232 232
     if (q <= eofmap) {
233
-	flags |= 1 << BAD_PDF_TRAILER;
233
+	pdf.flags |= 1 << BAD_PDF_TRAILER;
234 234
 	cli_dbgmsg("cli_pdf: %%%%EOF not found\n");
235 235
     } else {
236 236
 	size = q - eofmap + map_off;
... ...
@@ -239,8 +426,8 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
239 239
 		break;
240 240
 	}
241 241
 	if (q <= eofmap) {
242
-	    flags |= 1 << BAD_PDF_TRAILER;
243
-    	    cli_dbgmsg("cli_pdf: startxref not found\n");
242
+	    pdf.flags |= 1 << BAD_PDF_TRAILER;
243
+	    cli_dbgmsg("cli_pdf: startxref not found\n");
244 244
 	}
245 245
 	q += 9;
246 246
 	while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; }
... ...
@@ -251,7 +438,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
251 251
 	q = fmap_need_off_once(map, offset + xref, bytesleft);
252 252
 	if (!q || xrefCheck(q, q+bytesleft) == -1) {
253 253
 	    cli_dbgmsg("cli_pdf: did not find valid xref\n");
254
-	    flags |= 1 << BAD_PDF_TRAILER;
254
+	    pdf.flags |= 1 << BAD_PDF_TRAILER;
255 255
 	}
256 256
     }
257 257
 
... ...
@@ -261,15 +448,16 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
261 261
 	cli_errmsg("cli_pdf: mmap() failed\n");
262 262
 	return CL_EMAP;
263 263
     }
264
-    while ((rc = pdf_parseobj(&pdf)) > 0) {
264
+    while ((rc = pdf_findobj(&pdf)) > 0) {
265 265
 	struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1];
266 266
 	cli_dbgmsg("found %d %d obj @%ld\n", obj->id >> 8, obj->id&0xff, obj->start + offset);
267
+	pdf_parseobj(&pdf, obj);
267 268
     }
268 269
     if (rc == -1)
269
-	flags |= 1 << BAD_PDF_TOOMANYOBJS;
270
+	pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS;
270 271
 
271
-    if (flags)
272
-	cli_dbgmsg("cli_pdf: flags 0x%02x\n", flags);
272
+    if (pdf.flags)
273
+	cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
273 274
     return CL_SUCCESS;
274 275
 }
275 276