Browse code

new pdf wip

Török Edvin authored on 2010/05/10 17:57:44
Showing 2 changed files
... ...
@@ -130,8 +130,8 @@ static const char *ftypes_int[] = {
130 130
   "0:0:5349502d48495420285349502f48:SIP log:CL_TYPE_ANY:CL_TYPE_IGNORED",
131 131
   "1:0:3c2540204c414e4755414745203d:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
132 132
   "0:0:7b5c727466:RTF:CL_TYPE_ANY:CL_TYPE_RTF:30",
133
-  "1:*:255044462d??2e:PDF:CL_TYPE_ANY:CL_TYPE_PDF:30",
134
-  "1:*:257064662d??2e:PDF:CL_TYPE_ANY:CL_TYPE_PDF:30",
133
+  "1:0,1024:255044462d??2e:PDF:CL_TYPE_ANY:CL_TYPE_PDF:30",
134
+  "1:0,1024:257064662d??2e:PDF:CL_TYPE_ANY:CL_TYPE_PDF:30",
135 135
   "0:257:7573746172:TAR-POSIX:CL_TYPE_ANY:CL_TYPE_POSIX_TAR",
136 136
   "0:0:5b616c69617365735d:mirc ini:CL_TYPE_ANY:CL_TYPE_SCRIPT",
137 137
   "1:0,1024:0a(46|66)726f6d3a20{-1024}0a(4d|6d)(49|69)(4d|6d)(45|65)2d(56|76)657273696f6e3a20:Mail file:CL_TYPE_ANY:CL_TYPE_MAIL",
... ...
@@ -60,8 +60,221 @@ static	int	ascii85decode(const char *buf, off_t len, unsigned char *output);
60 60
 static	const	char	*pdf_nextlinestart(const char *ptr, size_t len);
61 61
 static	const	char	*pdf_nextobject(const char *ptr, size_t len);
62 62
 
63
/*
 * Anomalies noticed while examining a PDF.
 *
 * Each enumerator is a bit position, not a mask: cli_pdf() records a finding
 * with `flags |= 1 << BAD_PDF_xxx`.
 */
enum pdf_flag {
    BAD_PDF_VERSION     = 0, /* header is not of the accepted "%PDF-1.x" form */
    BAD_PDF_HEADERPOS   = 1, /* "%PDF-" header is not at the very start */
    BAD_PDF_TRAILER     = 2, /* %%EOF, startxref or the xref itself is missing/invalid */
    BAD_PDF_TOOMANYOBJS = 3  /* pdf_parseobj() could not grow the object table */
};
69
+
70
/*
 * Check whether the bytes in [xref, eof) look like a cross-reference section.
 *
 * Leading spaces, LF and CR are skipped. The data is accepted when it then
 * starts with the keyword "xref" (classic table) or when the name "/XRef"
 * appears anywhere in the remaining range (cross-reference stream).
 *
 * xref: first byte to examine
 * eof:  one past the last readable byte
 *
 * Returns 0 when a cross-reference marker was found, -1 otherwise.
 */
static int xrefCheck(const char *xref, const char *eof)
{
    const char *q;

    /* The bound check must guard every comparison, hence the parentheses:
     * without them `a && b || c || d` dereferences xref past eof. */
    while (xref < eof && (*xref == ' ' || *xref == '\n' || *xref == '\r'))
        xref++;

    /* need room for the 4 byte keyword */
    if (eof - xref < 4)
        return -1;
    if (!memcmp(xref, "xref", 4)) {
        cli_dbgmsg("cli_pdf: found xref\n");
        return 0;
    }

    /* could be xref stream; compare all 5 bytes of the name */
    for (q = xref; eof - q >= 5; q++) {
        if (!memcmp(q, "/XRef", 5)) {
            cli_dbgmsg("cli_pdf: found /XRef\n");
            return 0;
        }
    }
    return -1;
}
90
+
91
/*
 * Per-object properties stored in pdf_obj.flags.
 * Enumerators are bit positions; pdf_parseobj() sets them with
 * `flags |= 1 << OBJ_xxx`.
 */
enum objflags {
    OBJ_STREAM = 0 /* a "stream" keyword was seen inside the object */
};
94
+
95
/* One indirect object located by pdf_parseobj(). */
struct pdf_obj {
    /* offset, relative to pdf_struct.map, of the first byte after " obj" */
    uint32_t start;
    /* (object number << 8) | (generation number & 0xff) */
    uint32_t id;
    /* bitmask built from `1 << enum objflags` values */
    uint32_t flags;
};
100
/* Parser state shared between cli_pdf() and pdf_parseobj(). */
struct pdf_struct {
    /* table of objects found so far; grown by pdf_parseobj() */
    struct pdf_obj *objs;
    /* number of entries in objs */
    unsigned nobjs;
    /* mapped document data that is being parsed */
    const char *map;
    /* number of bytes available at map */
    off_t size;
    /* current parse position, relative to map */
    off_t offset;
};
107
+
108
/*
 * Walk backwards from q while the current byte is PDF whitespace
 * (NUL, TAB, LF, FF, CR or space) and q is still above start.
 *
 * Returns the first position at or below q that holds a non-whitespace byte,
 * or start when the walk reaches it (the byte at start is never examined).
 * When q is not above start to begin with, q is returned untouched.
 */
static const char *findNextNonWSBack(const char *q, const char *start)
{
    for (; q > start; q--) {
        switch (*q) {
        case 0x00:
        case 0x09:
        case 0x0a:
        case 0x0c:
        case 0x0d:
        case 0x20:
            /* whitespace: keep walking back */
            continue;
        default:
            return q;
        }
    }
    return q;
}
117
+
118
+static int pdf_parseobj(struct pdf_struct *pdf)
119
+{
120
+    const char *start, *q, *q2, *eof;
121
+    struct pdf_obj *obj;
122
+    off_t bytesleft;
123
+    unsigned genid, objid;
124
+
125
+    pdf->nobjs++;
126
+    pdf->objs = cli_realloc2(pdf->objs, sizeof(*pdf->objs)*pdf->nobjs);
127
+    if (!pdf->objs) {
128
+	cli_warnmsg("cli_pdf: out of memory parsing objects (%ld)\n", pdf->nobjs);
129
+	return -1;
130
+    }
131
+    obj = &pdf->objs[pdf->nobjs-1];
132
+    start = pdf->map+pdf->offset;
133
+    bytesleft = pdf->size - pdf->offset;
134
+    q2 = cli_memstr(start, bytesleft, " obj", 4);
135
+    if (!q2)
136
+	return 0;/* no more objs */
137
+    bytesleft -= q2 - start;
138
+    q = findNextNonWSBack(q2-1, start);
139
+    while (q > start && isdigit(*q)) { q--; }
140
+    genid = atoi(q);
141
+    q = findNextNonWSBack(q-1,start);
142
+    while (q > start && isdigit(*q)) { q--; }
143
+    objid = atoi(q);
144
+    obj->id = (objid << 8) | (genid&0xff);
145
+    obj->start = q2+4 - pdf->map;
146
+    obj->flags = 0;
147
+    bytesleft -= 4;
148
+    eof = pdf->map + pdf->size;
149
+    q = pdf->map + obj->start;
150
+    while (q < eof && bytesleft > 0) {
151
+	q2 = pdf_nextobject(q, bytesleft);
152
+	if (!q2)
153
+	    return 0;/* no more objs */
154
+	bytesleft -= q2 - q;
155
+	if (!memcmp(q2, "stream", 6)) {
156
+	    obj->flags |= 1 << OBJ_STREAM;
157
+	    q2 += 6;
158
+	    bytesleft -= 6;
159
+	    q2 = cli_memstr(q2, bytesleft, "endstream", 9);
160
+	    if (!q2)
161
+		return 0;/* no more objs */
162
+	    q2 += 6;
163
+	    bytesleft -= q2 - q;
164
+	} else if (!memcmp(q2,"endobj",6)) {
165
+	    q2 += 6;
166
+	    pdf->offset = q2 - pdf->map;
167
+	    return 1; /* obj found and offset positioned */
168
+	} else {
169
+	    q2 = q+1;
170
+	}
171
+	q = q2;
172
+    }
173
+    return 0;/* no more objs */
174
+}
175
+
176
+int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
177
+{
178
+    struct pdf_struct pdf;
179
+    unsigned flags = 0;
180
+    fmap_t *map = *ctx->fmap;
181
+    size_t size = map->len - offset;
182
+    off_t versize = size > 1032 ? 1032 : size;
183
+    off_t map_off, bytesleft;
184
+    long xref;
185
+    const char *pdfver, *start, *eofmap, *q, *eof;
186
+    int rc;
187
+
188
+    cli_dbgmsg("in cli_pdf(%s)\n", dir);
189
+    memset(&pdf, 0, sizeof(pdf));
190
+
191
+    pdfver = start = fmap_need_off_once(map, offset, versize);
192
+
193
+    /* Check PDF version */
194
+    if (!pdfver) {
195
+	cli_errmsg("cli_pdf: mmap() failed\n");
196
+	return CL_EMAP;
197
+    }
198
+    /* offset is 0 when coming from filetype2 */
199
+    pdfver = cli_memstr(pdfver, versize, "%PDF-", 5);
200
+    if (!pdfver) {
201
+	cli_dbgmsg("cli_pdf: no PDF- header found\n");
202
+	return CL_SUCCESS;
203
+    }
204
+    /* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future
205
+     * versions */
206
+    if (pdfver[5] != '1' || pdfver[6] != '.' ||
207
+	pdfver[7] < '1' || pdfver[7] > '9') {
208
+	flags |= 1 << BAD_PDF_VERSION;
209
+	cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver);
210
+    }
211
+    if (pdfver != start || offset) {
212
+	flags |= 1 << BAD_PDF_HEADERPOS;
213
+	cli_dbgmsg("cli_pdf: PDF header is not at position 0: %d\n",pdfver-start+offset);
214
+    }
215
+    offset += pdfver - start;
216
+
217
+    /* find trailer and xref, don't fail if not found */
218
+    map_off = map->len - 2048;
219
+    if (map_off < 0)
220
+	map_off = 0;
221
+    bytesleft = map->len - map_off;
222
+    eofmap = fmap_need_off_once(map, map_off, bytesleft);
223
+    if (!eofmap) {
224
+	cli_errmsg("cli_pdf: mmap() failed\n");
225
+	return CL_EMAP;
226
+    }
227
+    eof = eofmap + bytesleft;
228
+    for (q=&eofmap[bytesleft-5]; q > eofmap; q--) {
229
+	if (memcmp(q, "%%EOF", 5) == 0)
230
+	    break;
231
+    }
232
+    if (q <= eofmap) {
233
+	flags |= 1 << BAD_PDF_TRAILER;
234
+	cli_dbgmsg("cli_pdf: %%%%EOF not found\n");
235
+    } else {
236
+	size = q - eofmap + map_off;
237
+	for (;q > eofmap;q--) {
238
+	    if (memcmp(q, "startxref", 9) == 0)
239
+		break;
240
+	}
241
+	if (q <= eofmap) {
242
+	    flags |= 1 << BAD_PDF_TRAILER;
243
+    	    cli_dbgmsg("cli_pdf: startxref not found\n");
244
+	}
245
+	q += 9;
246
+	while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; }
247
+	xref = atol(q);
248
+	bytesleft = map->len - offset - xref;
249
+	if (bytesleft > 4096)
250
+	    bytesleft = 4096;
251
+	q = fmap_need_off_once(map, offset + xref, bytesleft);
252
+	if (!q || xrefCheck(q, q+bytesleft) == -1) {
253
+	    cli_dbgmsg("cli_pdf: did not find valid xref\n");
254
+	    flags |= 1 << BAD_PDF_TRAILER;
255
+	}
256
+    }
257
+
258
+    pdf.size = size;
259
+    pdf.map = fmap_need_off_once(map, offset, size);
260
+    if (!pdf.map) {
261
+	cli_errmsg("cli_pdf: mmap() failed\n");
262
+	return CL_EMAP;
263
+    }
264
+    while ((rc = pdf_parseobj(&pdf)) > 0) {
265
+	struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1];
266
+	cli_dbgmsg("found %d %d obj @%ld\n", obj->id >> 8, obj->id&0xff, obj->start + offset);
267
+    }
268
+    if (rc == -1)
269
+	flags |= 1 << BAD_PDF_TOOMANYOBJS;
270
+
271
+    if (flags)
272
+	cli_dbgmsg("cli_pdf: flags 0x%02x\n", flags);
273
+    return CL_SUCCESS;
274
+}
275
+
63 276
 int
64
-cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
277
+cli_pdfold(const char *dir, cli_ctx *ctx, off_t offset)
65 278
 {
66 279
 	off_t size;	/* total number of bytes in the file */
67 280
 	off_t bytesleft, trailerlength;