... | ... |
@@ -64,7 +64,10 @@ enum pdf_flag { |
64 | 64 |
BAD_PDF_VERSION=0, |
65 | 65 |
BAD_PDF_HEADERPOS, |
66 | 66 |
BAD_PDF_TRAILER, |
67 |
- BAD_PDF_TOOMANYOBJS |
|
67 |
+ BAD_PDF_TOOMANYOBJS, |
|
68 |
+ BAD_STREAM_FILTERS, |
|
69 |
+ UNTERMINATED_OBJ_DICT, |
|
70 |
+ ESCAPED_COMMON_PDFNAME, |
|
68 | 71 |
}; |
69 | 72 |
|
70 | 73 |
static int xrefCheck(const char *xref, const char *eof) |
... | ... |
@@ -89,7 +92,19 @@ static int xrefCheck(const char *xref, const char *eof) |
89 | 89 |
} |
90 | 90 |
|
91 | 91 |
/*
 * Bit indices for the per-object flag word (pdf_obj.flags).
 * A property is recorded as (1 << value), so the values must stay
 * small, distinct and stable.
 */
enum objflags {
    OBJ_STREAM       = 0,  /* object contains a "stream" keyword */
    OBJ_DICT         = 1,  /* object has a << ... >> dictionary */
    /* one bit per stream filter name found in the dictionary */
    OBJ_FILTER_AH    = 2,
    OBJ_FILTER_A85   = 3,
    OBJ_FILTER_FLATE = 4,
    OBJ_FILTER_LZW   = 5,
    OBJ_FILTER_RL    = 6,
    OBJ_FILTER_FAX   = 7,
    OBJ_FILTER_JBIG2 = 8,
    OBJ_FILTER_DCT   = 9,
    OBJ_FILTER_JPX   = 10,
    OBJ_FILTER_CRYPT = 11,
    OBJ_JAVASCRIPT   = 12  /* a JavaScript name was seen after S */
};
94 | 106 |
|
95 | 107 |
struct pdf_obj { |
... | ... |
@@ -103,6 +118,7 @@ struct pdf_struct { |
103 | 103 |
const char *map; |
104 | 104 |
off_t size; |
105 | 105 |
off_t offset; |
106 |
+ unsigned flags; |
|
106 | 107 |
}; |
107 | 108 |
|
108 | 109 |
static const char *findNextNonWSBack(const char *q, const char *start) |
... | ... |
@@ -115,7 +131,7 @@ static const char *findNextNonWSBack(const char *q, const char *start) |
115 | 115 |
return q; |
116 | 116 |
} |
117 | 117 |
|
118 |
-static int pdf_parseobj(struct pdf_struct *pdf) |
|
118 |
+static int pdf_findobj(struct pdf_struct *pdf) |
|
119 | 119 |
{ |
120 | 120 |
const char *start, *q, *q2, *eof; |
121 | 121 |
struct pdf_obj *obj; |
... | ... |
@@ -152,7 +168,7 @@ static int pdf_parseobj(struct pdf_struct *pdf) |
152 | 152 |
if (!q2) |
153 | 153 |
return 0;/* no more objs */ |
154 | 154 |
bytesleft -= q2 - q; |
155 |
- if (!memcmp(q2, "stream", 6)) { |
|
155 |
+ if ((q2 = cli_memstr(q-1, q2-q+1, "stream", 6))) { |
|
156 | 156 |
obj->flags |= 1 << OBJ_STREAM; |
157 | 157 |
q2 += 6; |
158 | 158 |
bytesleft -= 6; |
... | ... |
@@ -161,22 +177,193 @@ static int pdf_parseobj(struct pdf_struct *pdf) |
161 | 161 |
return 0;/* no more objs */ |
162 | 162 |
q2 += 6; |
163 | 163 |
bytesleft -= q2 - q; |
164 |
- } else if (!memcmp(q2,"endobj",6)) { |
|
164 |
+ } else if ((q2 = cli_memstr(q-1, q2-q+1, "endobj", 6))) { |
|
165 | 165 |
q2 += 6; |
166 | 166 |
pdf->offset = q2 - pdf->map; |
167 | 167 |
return 1; /* obj found and offset positioned */ |
168 | 168 |
} else { |
169 |
- q2 = q+1; |
|
169 |
+ q2++; |
|
170 | 170 |
} |
171 | 171 |
q = q2; |
172 | 172 |
} |
173 | 173 |
return 0;/* no more objs */ |
174 | 174 |
} |
175 | 175 |
|
176 |
+static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag) |
|
177 |
+{ |
|
178 |
+ const char *s; |
|
179 |
+ pdf->flags |= 1 << flag; |
|
180 |
+ if (!cli_debug_flag) |
|
181 |
+ return; |
|
182 |
+ switch (flag) { |
|
183 |
+ case UNTERMINATED_OBJ_DICT: |
|
184 |
+ s = "dictionary not terminated"; |
|
185 |
+ break; |
|
186 |
+ case ESCAPED_COMMON_PDFNAME: |
|
187 |
+ /* like /JavaScript */ |
|
188 |
+ s = "escaped common pdfname"; |
|
189 |
+ break; |
|
190 |
+ case BAD_STREAM_FILTERS: |
|
191 |
+ s = "duplicate stream filters"; |
|
192 |
+ break; |
|
193 |
+ case BAD_PDF_VERSION: |
|
194 |
+ case BAD_PDF_HEADERPOS: |
|
195 |
+ case BAD_PDF_TRAILER: |
|
196 |
+ case BAD_PDF_TOOMANYOBJS: |
|
197 |
+ return; |
|
198 |
+ } |
|
199 |
+ cli_dbgmsg("cli_pdf: %s in object %u %u\n", s, obj->id>>8, obj->id&0xff); |
|
200 |
+} |
|
201 |
+ |
|
202 |
/*
 * Tiny state machine used while walking the names of one object's
 * dictionary: the state remembers which "interesting" name was seen
 * last, so that a following name can be interpreted in context.
 */
enum objstate {
    STATE_NONE   = 0,  /* nothing of interest seen yet */
    STATE_S      = 1,  /* the name S was just seen */
    STATE_FILTER = 2,  /* inside the list of names following Filter */
    STATE_ANY    = 3   /* for actions table below */
};
|
208 |
+ |
|
209 |
+struct pdfname_action { |
|
210 |
+ const char *pdfname; |
|
211 |
+ enum objflags set_objflag;/* OBJ_DICT is noop */ |
|
212 |
+ enum objstate from_state;/* STATE_NONE is noop */ |
|
213 |
+ enum objstate to_state; |
|
214 |
+}; |
|
215 |
+ |
|
216 |
+static struct pdfname_action pdfname_actions[] = { |
|
217 |
+ {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER}, |
|
218 |
+ {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER}, |
|
219 |
+ {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER}, |
|
220 |
+ {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER}, |
|
221 |
+ {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER}, |
|
222 |
+ {"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER}, |
|
223 |
+ {"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER}, |
|
224 |
+ {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER}, |
|
225 |
+ {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER}, |
|
226 |
+ {"Crypt", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE}, |
|
227 |
+ {"Filter", OBJ_DICT, STATE_ANY, STATE_FILTER}, |
|
228 |
+ {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_NONE}, |
|
229 |
+ {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE}, |
|
230 |
+ {"S", OBJ_DICT, STATE_NONE, STATE_S}, |
|
231 |
+ {"Type", OBJ_DICT, STATE_NONE, STATE_NONE} |
|
232 |
+}; |
|
233 |
+ |
|
234 |
+static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, |
|
235 |
+ const char *pdfname, int escapes, |
|
236 |
+ const char *after, enum objstate *state) |
|
237 |
+{ |
|
238 |
+ struct pdfname_action *act = NULL; |
|
239 |
+ unsigned j; |
|
240 |
+ for (j=0;j<sizeof(pdfname_actions)/sizeof(pdfname_actions[0]);j++) { |
|
241 |
+ if (!strncmp(pdfname, pdfname_actions[j].pdfname, strlen(pdfname_actions[j].pdfname))) { |
|
242 |
+ act = &pdfname_actions[j]; |
|
243 |
+ break; |
|
244 |
+ } |
|
245 |
+ } |
|
246 |
+ if (!act) |
|
247 |
+ return; |
|
248 |
+ if (escapes) { |
|
249 |
+ /* if a commonly used PDF name is escaped that is certainly |
|
250 |
+ suspicious. */ |
|
251 |
+ cli_dbgmsg("cli_pdf: pdfname %s is escaped\n", pdfname); |
|
252 |
+ pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME); |
|
253 |
+ } |
|
254 |
+ if (act->from_state == *state || |
|
255 |
+ act->from_state == STATE_ANY) { |
|
256 |
+ *state = act->to_state; |
|
257 |
+ |
|
258 |
+ if (*state == STATE_FILTER && |
|
259 |
+ act->set_objflag !=OBJ_DICT && |
|
260 |
+ (obj->flags & (1 << act->set_objflag))) { |
|
261 |
+ pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS); |
|
262 |
+ } |
|
263 |
+ obj->flags |= 1 << act->set_objflag; |
|
264 |
+ } else { |
|
265 |
+ //auto-reset states |
|
266 |
+ switch (*state) { |
|
267 |
+ case STATE_S: |
|
268 |
+ *state = STATE_NONE; |
|
269 |
+ break; |
|
270 |
+ } |
|
271 |
+ } |
|
272 |
+} |
|
273 |
+ |
|
274 |
+static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
|
275 |
+{ |
|
276 |
+ /* enough to hold common pdf names, we don't need all the names */ |
|
277 |
+ char pdfname[64]; |
|
278 |
+ const char *q2, *q3; |
|
279 |
+ const char *q = obj->start + pdf->map; |
|
280 |
+ const char *dict, *start; |
|
281 |
+ off_t dict_length; |
|
282 |
+ off_t bytesleft = pdf->offset - obj->start; |
|
283 |
+ unsigned i; |
|
284 |
+ enum objstate objstate = STATE_NONE; |
|
285 |
+ |
|
286 |
+ if (bytesleft < 0) |
|
287 |
+ return; |
|
288 |
+ start = q; |
|
289 |
+ /* find start of dictionary */ |
|
290 |
+ do { |
|
291 |
+ q2 = pdf_nextobject(q, bytesleft); |
|
292 |
+ bytesleft -= q2 -q; |
|
293 |
+ if (!q2 || bytesleft < 0) { |
|
294 |
+ return; |
|
295 |
+ } |
|
296 |
+ q3 = memchr(q-1, '<', q2-q+1); |
|
297 |
+ q2++; |
|
298 |
+ bytesleft--; |
|
299 |
+ q = q2; |
|
300 |
+ } while (!q3 || q3[1] != '<'); |
|
301 |
+ dict = q3+2; |
|
302 |
+ q = dict; |
|
303 |
+ bytesleft = pdf->offset - obj->start - (q3 - start); |
|
304 |
+ /* find end of dictionary */ |
|
305 |
+ do { |
|
306 |
+ q2 = pdf_nextobject(q, bytesleft); |
|
307 |
+ bytesleft -= q2 -q; |
|
308 |
+ if (!q2 || bytesleft < 0) { |
|
309 |
+ return; |
|
310 |
+ } |
|
311 |
+ q3 = memchr(q-1, '>', q2-q+1); |
|
312 |
+ q2++; |
|
313 |
+ bytesleft--; |
|
314 |
+ q = q2; |
|
315 |
+ } while (!q3 || q3[1] != '>'); |
|
316 |
+ obj->flags |= 1 << OBJ_DICT; |
|
317 |
+ dict_length = q3 - dict; |
|
318 |
+ |
|
319 |
+ // process pdf names |
|
320 |
+ for (q = dict;dict_length;) { |
|
321 |
+ int escapes = 0; |
|
322 |
+ q2 = memchr(q, '/', dict_length); |
|
323 |
+ if (!q2) |
|
324 |
+ break; |
|
325 |
+ dict_length -= q2 - q; |
|
326 |
+ // normalize PDF names |
|
327 |
+ for (i = 0;dict_length && (i < sizeof(pdfname)-1); i++) { |
|
328 |
+ q++; |
|
329 |
+ dict_length--; |
|
330 |
+ if (*q == '#') { |
|
331 |
+ cli_hex2str_to(q+1, pdfname+i, 2); |
|
332 |
+ q += 2; |
|
333 |
+ dict_length -= 2; |
|
334 |
+ escapes = 1; |
|
335 |
+ continue; |
|
336 |
+ } |
|
337 |
+ if (*q == ' ' || *q == '\r' || *q == '\n') |
|
338 |
+ break; |
|
339 |
+ pdfname[i] = *q; |
|
340 |
+ } |
|
341 |
+ pdfname[i] = '\0'; |
|
342 |
+ |
|
343 |
+ handle_pdfname(pdf, obj, pdfname, escapes, q, &objstate); |
|
344 |
+ } |
|
345 |
+ cli_dbgmsg("cli_pdf: %u %u obj flags: %02x\n", obj->id>>8, obj->id&0xff, obj->flags); |
|
346 |
+} |
|
347 |
+ |
|
176 | 348 |
int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
177 | 349 |
{ |
178 | 350 |
struct pdf_struct pdf; |
179 |
- unsigned flags = 0; |
|
180 | 351 |
fmap_t *map = *ctx->fmap; |
181 | 352 |
size_t size = map->len - offset; |
182 | 353 |
off_t versize = size > 1032 ? 1032 : size; |
... | ... |
@@ -205,11 +392,11 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
205 | 205 |
* versions */ |
206 | 206 |
if (pdfver[5] != '1' || pdfver[6] != '.' || |
207 | 207 |
pdfver[7] < '1' || pdfver[7] > '9') { |
208 |
- flags |= 1 << BAD_PDF_VERSION; |
|
208 |
+ pdf.flags |= 1 << BAD_PDF_VERSION; |
|
209 | 209 |
cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver); |
210 | 210 |
} |
211 | 211 |
if (pdfver != start || offset) { |
212 |
- flags |= 1 << BAD_PDF_HEADERPOS; |
|
212 |
+ pdf.flags |= 1 << BAD_PDF_HEADERPOS; |
|
213 | 213 |
cli_dbgmsg("cli_pdf: PDF header is not at position 0: %d\n",pdfver-start+offset); |
214 | 214 |
} |
215 | 215 |
offset += pdfver - start; |
... | ... |
@@ -230,7 +417,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
230 | 230 |
break; |
231 | 231 |
} |
232 | 232 |
if (q <= eofmap) { |
233 |
- flags |= 1 << BAD_PDF_TRAILER; |
|
233 |
+ pdf.flags |= 1 << BAD_PDF_TRAILER; |
|
234 | 234 |
cli_dbgmsg("cli_pdf: %%%%EOF not found\n"); |
235 | 235 |
} else { |
236 | 236 |
size = q - eofmap + map_off; |
... | ... |
@@ -239,8 +426,8 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
239 | 239 |
break; |
240 | 240 |
} |
241 | 241 |
if (q <= eofmap) { |
242 |
- flags |= 1 << BAD_PDF_TRAILER; |
|
243 |
- cli_dbgmsg("cli_pdf: startxref not found\n"); |
|
242 |
+ pdf.flags |= 1 << BAD_PDF_TRAILER; |
|
243 |
+ cli_dbgmsg("cli_pdf: startxref not found\n"); |
|
244 | 244 |
} |
245 | 245 |
q += 9; |
246 | 246 |
while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; } |
... | ... |
@@ -251,7 +438,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
251 | 251 |
q = fmap_need_off_once(map, offset + xref, bytesleft); |
252 | 252 |
if (!q || xrefCheck(q, q+bytesleft) == -1) { |
253 | 253 |
cli_dbgmsg("cli_pdf: did not find valid xref\n"); |
254 |
- flags |= 1 << BAD_PDF_TRAILER; |
|
254 |
+ pdf.flags |= 1 << BAD_PDF_TRAILER; |
|
255 | 255 |
} |
256 | 256 |
} |
257 | 257 |
|
... | ... |
@@ -261,15 +448,16 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
261 | 261 |
cli_errmsg("cli_pdf: mmap() failed\n"); |
262 | 262 |
return CL_EMAP; |
263 | 263 |
} |
264 |
- while ((rc = pdf_parseobj(&pdf)) > 0) { |
|
264 |
+ while ((rc = pdf_findobj(&pdf)) > 0) { |
|
265 | 265 |
struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1]; |
266 | 266 |
cli_dbgmsg("found %d %d obj @%ld\n", obj->id >> 8, obj->id&0xff, obj->start + offset); |
267 |
+ pdf_parseobj(&pdf, obj); |
|
267 | 268 |
} |
268 | 269 |
if (rc == -1) |
269 |
- flags |= 1 << BAD_PDF_TOOMANYOBJS; |
|
270 |
+ pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS; |
|
270 | 271 |
|
271 |
- if (flags) |
|
272 |
- cli_dbgmsg("cli_pdf: flags 0x%02x\n", flags); |
|
272 |
+ if (pdf.flags) |
|
273 |
+ cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags); |
|
273 | 274 |
return CL_SUCCESS; |
274 | 275 |
} |
275 | 276 |
|