... | ... |
@@ -130,8 +130,8 @@ static const char *ftypes_int[] = { |
130 | 130 |
"0:0:5349502d48495420285349502f48:SIP log:CL_TYPE_ANY:CL_TYPE_IGNORED", |
131 | 131 |
"1:0:3c2540204c414e4755414745203d:HTML data:CL_TYPE_ANY:CL_TYPE_HTML", |
132 | 132 |
"0:0:7b5c727466:RTF:CL_TYPE_ANY:CL_TYPE_RTF:30", |
133 |
- "1:*:255044462d??2e:PDF:CL_TYPE_ANY:CL_TYPE_PDF:30", |
|
134 |
- "1:*:257064662d??2e:PDF:CL_TYPE_ANY:CL_TYPE_PDF:30", |
|
133 |
+ "1:0,1024:255044462d??2e:PDF:CL_TYPE_ANY:CL_TYPE_PDF:30", |
|
134 |
+ "1:0,1024:257064662d??2e:PDF:CL_TYPE_ANY:CL_TYPE_PDF:30", |
|
135 | 135 |
"0:257:7573746172:TAR-POSIX:CL_TYPE_ANY:CL_TYPE_POSIX_TAR", |
136 | 136 |
"0:0:5b616c69617365735d:mirc ini:CL_TYPE_ANY:CL_TYPE_SCRIPT", |
137 | 137 |
"1:0,1024:0a(46|66)726f6d3a20{-1024}0a(4d|6d)(49|69)(4d|6d)(45|65)2d(56|76)657273696f6e3a20:Mail file:CL_TYPE_ANY:CL_TYPE_MAIL", |
... | ... |
@@ -60,8 +60,221 @@ static int ascii85decode(const char *buf, off_t len, unsigned char *output); |
60 | 60 |
static const char *pdf_nextlinestart(const char *ptr, size_t len); |
61 | 61 |
static const char *pdf_nextobject(const char *ptr, size_t len); |
62 | 62 |
|
63 |
/*
 * Anomalies noticed while scanning a PDF.  Each enumerator is a bit
 * position: cli_pdf() records an anomaly with `flags |= 1 << value`.
 */
enum pdf_flag {
    BAD_PDF_VERSION     = 0, /* header version is not of the accepted form */
    BAD_PDF_HEADERPOS   = 1, /* "%PDF-" header is not at position 0 */
    BAD_PDF_TRAILER     = 2, /* %%EOF, startxref or a valid xref is missing */
    BAD_PDF_TOOMANYOBJS = 3  /* set when object parsing reports -1 */
};
|
69 |
+ |
|
70 |
/*
 * Check whether the bytes in [xref, eof) look like a cross-reference section.
 *
 * Leading spaces, CRs and LFs are skipped.  The data is accepted when it then
 * starts with the keyword "xref", or when the name "/XRef" (which would
 * indicate an xref stream) occurs anywhere in the remaining bytes.
 *
 * xref: first byte to examine
 * eof:  one past the last readable byte
 *
 * Returns 0 when a marker is found, -1 otherwise (including when fewer than
 * five bytes remain after the leading whitespace).
 */
static int xrefCheck(const char *xref, const char *eof)
{
    const char *q;

    /* The bounds test has to guard every dereference, so the character
     * comparisons are grouped explicitly; without the parentheses `&&`
     * only protected the first comparison. */
    while (xref < eof && (*xref == ' ' || *xref == '\n' || *xref == '\r'))
        xref++;
    if (xref + 4 >= eof)
        return -1;
    if (!memcmp(xref, "xref", 4)) {
        cli_dbgmsg("cli_pdf: found xref\n");
        return 0;
    }
    /* could be xref stream */
    for (q = xref; q + 5 < eof; q++) {
        /* Compare all five bytes of the name; a length of 4 would also
         * accept any "/XRe?" sequence. */
        if (*q == '/' && !memcmp(q, "/XRef", 5)) {
            cli_dbgmsg("cli_pdf: found /XRef\n");
            return 0;
        }
    }
    return -1;
}
|
90 |
+ |
|
91 |
/*
 * Bit positions for the `flags` member of struct pdf_obj; a property is
 * recorded as `1 << value`.
 */
enum objflags {
    OBJ_STREAM = 0 /* a "stream" keyword was seen inside the object */
};
|
94 |
+ |
|
95 |
/* One object located in the PDF data. */
struct pdf_obj {
    uint32_t start; /* offset, relative to the mapped data, of the byte after " obj" */
    uint32_t id;    /* (object number << 8) | (generation number & 0xff) */
    uint32_t flags; /* bitmask built from `1 << enum objflags` values */
};
|
100 |
/* Parsing state shared between cli_pdf() and pdf_parseobj(). */
struct pdf_struct {
    struct pdf_obj *objs; /* array of object records, grown by pdf_parseobj() */
    unsigned nobjs;       /* number of slots currently allocated in objs */
    const char *map;      /* start of the mapped PDF data */
    off_t size;           /* number of bytes available at map */
    off_t offset;         /* current parse position, relative to map */
};
|
107 |
+ |
|
108 |
/*
 * Walk backwards from q over NUL, TAB, LF, FF, CR and space bytes.
 *
 * q:     position to start from (this byte is examined first)
 * start: lower bound; the byte at start itself is never examined
 *
 * Returns the first position at or before q that holds none of those bytes,
 * or start when the walk reaches it.  When q is not above start, q is
 * returned unchanged.
 */
static const char *findNextNonWSBack(const char *q, const char *start)
{
    for (; q > start; q--) {
        switch (*q) {
        case 0x00:
        case 0x09:
        case 0x0a:
        case 0x0c:
        case 0x0d:
        case 0x20:
            /* skippable byte: keep moving towards start */
            continue;
        default:
            return q;
        }
    }
    return q;
}
|
117 |
+ |
|
118 |
+static int pdf_parseobj(struct pdf_struct *pdf) |
|
119 |
+{ |
|
120 |
+ const char *start, *q, *q2, *eof; |
|
121 |
+ struct pdf_obj *obj; |
|
122 |
+ off_t bytesleft; |
|
123 |
+ unsigned genid, objid; |
|
124 |
+ |
|
125 |
+ pdf->nobjs++; |
|
126 |
+ pdf->objs = cli_realloc2(pdf->objs, sizeof(*pdf->objs)*pdf->nobjs); |
|
127 |
+ if (!pdf->objs) { |
|
128 |
+ cli_warnmsg("cli_pdf: out of memory parsing objects (%ld)\n", pdf->nobjs); |
|
129 |
+ return -1; |
|
130 |
+ } |
|
131 |
+ obj = &pdf->objs[pdf->nobjs-1]; |
|
132 |
+ start = pdf->map+pdf->offset; |
|
133 |
+ bytesleft = pdf->size - pdf->offset; |
|
134 |
+ q2 = cli_memstr(start, bytesleft, " obj", 4); |
|
135 |
+ if (!q2) |
|
136 |
+ return 0;/* no more objs */ |
|
137 |
+ bytesleft -= q2 - start; |
|
138 |
+ q = findNextNonWSBack(q2-1, start); |
|
139 |
+ while (q > start && isdigit(*q)) { q--; } |
|
140 |
+ genid = atoi(q); |
|
141 |
+ q = findNextNonWSBack(q-1,start); |
|
142 |
+ while (q > start && isdigit(*q)) { q--; } |
|
143 |
+ objid = atoi(q); |
|
144 |
+ obj->id = (objid << 8) | (genid&0xff); |
|
145 |
+ obj->start = q2+4 - pdf->map; |
|
146 |
+ obj->flags = 0; |
|
147 |
+ bytesleft -= 4; |
|
148 |
+ eof = pdf->map + pdf->size; |
|
149 |
+ q = pdf->map + obj->start; |
|
150 |
+ while (q < eof && bytesleft > 0) { |
|
151 |
+ q2 = pdf_nextobject(q, bytesleft); |
|
152 |
+ if (!q2) |
|
153 |
+ return 0;/* no more objs */ |
|
154 |
+ bytesleft -= q2 - q; |
|
155 |
+ if (!memcmp(q2, "stream", 6)) { |
|
156 |
+ obj->flags |= 1 << OBJ_STREAM; |
|
157 |
+ q2 += 6; |
|
158 |
+ bytesleft -= 6; |
|
159 |
+ q2 = cli_memstr(q2, bytesleft, "endstream", 9); |
|
160 |
+ if (!q2) |
|
161 |
+ return 0;/* no more objs */ |
|
162 |
+ q2 += 6; |
|
163 |
+ bytesleft -= q2 - q; |
|
164 |
+ } else if (!memcmp(q2,"endobj",6)) { |
|
165 |
+ q2 += 6; |
|
166 |
+ pdf->offset = q2 - pdf->map; |
|
167 |
+ return 1; /* obj found and offset positioned */ |
|
168 |
+ } else { |
|
169 |
+ q2 = q+1; |
|
170 |
+ } |
|
171 |
+ q = q2; |
|
172 |
+ } |
|
173 |
+ return 0;/* no more objs */ |
|
174 |
+} |
|
175 |
+ |
|
176 |
+int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
|
177 |
+{ |
|
178 |
+ struct pdf_struct pdf; |
|
179 |
+ unsigned flags = 0; |
|
180 |
+ fmap_t *map = *ctx->fmap; |
|
181 |
+ size_t size = map->len - offset; |
|
182 |
+ off_t versize = size > 1032 ? 1032 : size; |
|
183 |
+ off_t map_off, bytesleft; |
|
184 |
+ long xref; |
|
185 |
+ const char *pdfver, *start, *eofmap, *q, *eof; |
|
186 |
+ int rc; |
|
187 |
+ |
|
188 |
+ cli_dbgmsg("in cli_pdf(%s)\n", dir); |
|
189 |
+ memset(&pdf, 0, sizeof(pdf)); |
|
190 |
+ |
|
191 |
+ pdfver = start = fmap_need_off_once(map, offset, versize); |
|
192 |
+ |
|
193 |
+ /* Check PDF version */ |
|
194 |
+ if (!pdfver) { |
|
195 |
+ cli_errmsg("cli_pdf: mmap() failed\n"); |
|
196 |
+ return CL_EMAP; |
|
197 |
+ } |
|
198 |
+ /* offset is 0 when coming from filetype2 */ |
|
199 |
+ pdfver = cli_memstr(pdfver, versize, "%PDF-", 5); |
|
200 |
+ if (!pdfver) { |
|
201 |
+ cli_dbgmsg("cli_pdf: no PDF- header found\n"); |
|
202 |
+ return CL_SUCCESS; |
|
203 |
+ } |
|
204 |
+ /* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future |
|
205 |
+ * versions */ |
|
206 |
+ if (pdfver[5] != '1' || pdfver[6] != '.' || |
|
207 |
+ pdfver[7] < '1' || pdfver[7] > '9') { |
|
208 |
+ flags |= 1 << BAD_PDF_VERSION; |
|
209 |
+ cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver); |
|
210 |
+ } |
|
211 |
+ if (pdfver != start || offset) { |
|
212 |
+ flags |= 1 << BAD_PDF_HEADERPOS; |
|
213 |
+ cli_dbgmsg("cli_pdf: PDF header is not at position 0: %d\n",pdfver-start+offset); |
|
214 |
+ } |
|
215 |
+ offset += pdfver - start; |
|
216 |
+ |
|
217 |
+ /* find trailer and xref, don't fail if not found */ |
|
218 |
+ map_off = map->len - 2048; |
|
219 |
+ if (map_off < 0) |
|
220 |
+ map_off = 0; |
|
221 |
+ bytesleft = map->len - map_off; |
|
222 |
+ eofmap = fmap_need_off_once(map, map_off, bytesleft); |
|
223 |
+ if (!eofmap) { |
|
224 |
+ cli_errmsg("cli_pdf: mmap() failed\n"); |
|
225 |
+ return CL_EMAP; |
|
226 |
+ } |
|
227 |
+ eof = eofmap + bytesleft; |
|
228 |
+ for (q=&eofmap[bytesleft-5]; q > eofmap; q--) { |
|
229 |
+ if (memcmp(q, "%%EOF", 5) == 0) |
|
230 |
+ break; |
|
231 |
+ } |
|
232 |
+ if (q <= eofmap) { |
|
233 |
+ flags |= 1 << BAD_PDF_TRAILER; |
|
234 |
+ cli_dbgmsg("cli_pdf: %%%%EOF not found\n"); |
|
235 |
+ } else { |
|
236 |
+ size = q - eofmap + map_off; |
|
237 |
+ for (;q > eofmap;q--) { |
|
238 |
+ if (memcmp(q, "startxref", 9) == 0) |
|
239 |
+ break; |
|
240 |
+ } |
|
241 |
+ if (q <= eofmap) { |
|
242 |
+ flags |= 1 << BAD_PDF_TRAILER; |
|
243 |
+ cli_dbgmsg("cli_pdf: startxref not found\n"); |
|
244 |
+ } |
|
245 |
+ q += 9; |
|
246 |
+ while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; } |
|
247 |
+ xref = atol(q); |
|
248 |
+ bytesleft = map->len - offset - xref; |
|
249 |
+ if (bytesleft > 4096) |
|
250 |
+ bytesleft = 4096; |
|
251 |
+ q = fmap_need_off_once(map, offset + xref, bytesleft); |
|
252 |
+ if (!q || xrefCheck(q, q+bytesleft) == -1) { |
|
253 |
+ cli_dbgmsg("cli_pdf: did not find valid xref\n"); |
|
254 |
+ flags |= 1 << BAD_PDF_TRAILER; |
|
255 |
+ } |
|
256 |
+ } |
|
257 |
+ |
|
258 |
+ pdf.size = size; |
|
259 |
+ pdf.map = fmap_need_off_once(map, offset, size); |
|
260 |
+ if (!pdf.map) { |
|
261 |
+ cli_errmsg("cli_pdf: mmap() failed\n"); |
|
262 |
+ return CL_EMAP; |
|
263 |
+ } |
|
264 |
+ while ((rc = pdf_parseobj(&pdf)) > 0) { |
|
265 |
+ struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1]; |
|
266 |
+ cli_dbgmsg("found %d %d obj @%ld\n", obj->id >> 8, obj->id&0xff, obj->start + offset); |
|
267 |
+ } |
|
268 |
+ if (rc == -1) |
|
269 |
+ flags |= 1 << BAD_PDF_TOOMANYOBJS; |
|
270 |
+ |
|
271 |
+ if (flags) |
|
272 |
+ cli_dbgmsg("cli_pdf: flags 0x%02x\n", flags); |
|
273 |
+ return CL_SUCCESS; |
|
274 |
+} |
|
275 |
+ |
|
63 | 276 |
int |
64 |
-cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
|
277 |
+cli_pdfold(const char *dir, cli_ctx *ctx, off_t offset) |
|
65 | 278 |
{ |
66 | 279 |
off_t size; /* total number of bytes in the file */ |
67 | 280 |
off_t bytesleft, trailerlength; |