git-svn: trunk@2534
Tomasz Kojm authored on 2006/12/04 09:10:46... | ... |
@@ -1,3 +1,8 @@ |
1 |
+Mon Dec 4 01:04:30 CET 2006 (tk) |
|
2 |
+--------------------------------- |
|
3 |
+ * libclamav: commit experimental code from Edvin for extracting embedded |
|
4 |
+ objects from RTF files |
|
5 |
+ |
|
1 | 6 |
Sat Dec 2 17:46:31 GMT 2006 (njh) |
2 | 7 |
---------------------------------- |
3 | 8 |
* libclamav/pst.c: Fix compilation errors |
... | ... |
@@ -79,7 +79,7 @@ LTLIBRARIES = $(lib_LTLIBRARIES) |
79 | 79 |
libclamav_la_DEPENDENCIES = |
80 | 80 |
am_libclamav_la_OBJECTS = matcher-ac.lo matcher-bm.lo matcher-ncore.lo \ |
81 | 81 |
matcher.lo md5.lo others.lo readdb.lo cvd.lo dsig.lo str.lo \ |
82 |
- scanners.lo filetypes.lo blob.lo mbox.lo message.lo \ |
|
82 |
+ scanners.lo filetypes.lo rtf.lo blob.lo mbox.lo message.lo \ |
|
83 | 83 |
snprintf.lo table.lo text.lo ole2_extract.lo vba_extract.lo \ |
84 | 84 |
msexpand.lo pe.lo cabd.lo lzxd.lo mszipd.lo qtmd.lo system.lo \ |
85 | 85 |
upx.lo htmlnorm.lo chmunpack.lo rebuildpe.lo petite.lo \ |
... | ... |
@@ -253,6 +253,8 @@ libclamav_la_SOURCES = \ |
253 | 253 |
scanners.h \ |
254 | 254 |
filetypes.c \ |
255 | 255 |
filetypes.h \ |
256 |
+ rtf.c \ |
|
257 |
+ rtf.h \ |
|
256 | 258 |
blob.c \ |
257 | 259 |
blob.h \ |
258 | 260 |
mbox.c \ |
... | ... |
@@ -466,6 +468,7 @@ distclean-compile: |
466 | 466 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/readdb.Plo@am__quote@ |
467 | 467 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rebuildpe.Plo@am__quote@ |
468 | 468 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/regex_list.Plo@am__quote@ |
469 |
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rtf.Plo@am__quote@ |
|
469 | 470 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scanners.Plo@am__quote@ |
470 | 471 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256.Plo@am__quote@ |
471 | 472 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sis.Plo@am__quote@ |
... | ... |
@@ -123,6 +123,9 @@ static const struct cli_magic_s cli_magic[] = { |
123 | 123 |
{0, "\%PDF-", 5, "PDF document", CL_TYPE_PDF}, |
124 | 124 |
{0, "\266\271\254\256\376\377\377\377", 8, "CryptFF", CL_TYPE_CRYPTFF}, |
125 | 125 |
|
126 |
+#ifdef CL_EXPERIMENTAL |
|
127 |
+ {0, "{\\rtf", 5, "RTF", CL_TYPE_RTF}, |
|
128 |
+#endif |
|
126 | 129 |
/* Ignored types */ |
127 | 130 |
|
128 | 131 |
{0, "\000\000\001\263", 4, "MPEG video stream", CL_TYPE_DATA}, |
... | ... |
@@ -54,7 +54,9 @@ typedef enum { |
54 | 54 |
CL_TYPE_UUENCODED, |
55 | 55 |
CL_TYPE_PST, /* Microsoft Outlook binary email folder (.pst file) */ |
56 | 56 |
CL_TYPE_HTML_UTF16, |
57 |
- |
|
57 |
+#ifdef CL_EXPERIMENTAL |
|
58 |
+ CL_TYPE_RTF, |
|
59 |
+#endif |
|
58 | 60 |
/* bigger numbers have higher priority (in o-t-f detection) */ |
59 | 61 |
CL_TYPE_HTML, /* on the fly */ |
60 | 62 |
CL_TYPE_MAIL, /* magic + on the fly */ |
61 | 63 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,661 @@ |
0 |
+/* |
|
1 | ||
2 |
+ * |
|
3 |
+ * This program is free software; you can redistribute it and/or modify |
|
4 |
+ * it under the terms of the GNU General Public License as published by |
|
5 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
6 |
+ * (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * This program is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11 |
+ * GNU General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU General Public License |
|
14 |
+ * along with this program; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
16 |
+ * MA 02110-1301, USA. |
|
17 |
+ */ |
|
18 |
+ |
|
19 |
+#if HAVE_CONFIG_H |
|
20 |
+#include "clamav-config.h" |
|
21 |
+#endif |
|
22 |
+ |
|
23 |
+#ifdef CL_EXPERIMENTAL |
|
24 |
+ |
|
25 |
+#include <stdio.h> |
|
26 |
+#include <string.h> |
|
27 |
+#include <sys/types.h> |
|
28 |
+#include <ctype.h> |
|
29 |
+ |
|
30 |
+#ifdef HAVE_UNISTD_H |
|
31 |
+#include <unistd.h> |
|
32 |
+#endif |
|
33 |
+ |
|
34 |
+#include "others.h" |
|
35 |
+#include "rtf.h" |
|
36 |
+#include "clamav.h" |
|
37 |
+#include "table.h" |
|
38 |
+#include "scanners.h" |
|
39 |
+#include "vba_extract.h" |
|
40 |
+ |
|
41 |
/* Tokenizer states used by the scanning loop in cli_scanrtf(). */
enum parse_state {
	PARSE_MAIN,			/* plain text, braces and backslashes */
	PARSE_CONTROL_,			/* just saw a backslash */
	PARSE_CONTROL_WORD,		/* collecting the letters of a control word */
	PARSE_CONTROL_SYMBOL,		/* backslash followed by a non-letter */
	PARSE_CONTROL_WORD_PARAM,	/* collecting the numeric parameter */
	PARSE_INTERPRET_CONTROLWORD	/* control word complete, look it up */
};

/* Actions that a control word can be mapped to (see rtf_action_mapping). */
enum rtf_action
{
	RTF_OBJECT,
	RTF_OBJECT_DATA
};
|
48 |
+ |
|
49 |
+struct rtf_state; |
|
50 |
+typedef int (*rtf_callback_begin)(struct rtf_state*, cli_ctx* ctx,const char* tmpdir); |
|
51 |
+typedef int (*rtf_callback_process)(struct rtf_state*, const unsigned char* data,const size_t len); |
|
52 |
+typedef int (*rtf_callback_end)(struct rtf_state*, cli_ctx*); |
|
53 |
+ |
|
54 |
+struct rtf_state { |
|
55 |
+ size_t default_elements; |
|
56 |
+ size_t controlword_cnt; |
|
57 |
+ ssize_t controlword_param; |
|
58 |
+ enum parse_state parse_state; |
|
59 |
+ int controlword_param_sign; |
|
60 |
+ int encounteredTopLevel;/* encountered top-level control words that we care about */ |
|
61 |
+ char controlword[33]; |
|
62 |
+ rtf_callback_begin cb_begin;/* must be non-null if you want cb_process, and cb_end to be called, also it must change cb_data to non-null */ |
|
63 |
+ rtf_callback_process cb_process; |
|
64 |
+ rtf_callback_end cb_end; |
|
65 |
+ void* cb_data;/* data set up by cb_begin, used by cb_process, and cleaned up by cb_end. typically state data */ |
|
66 |
+}; |
|
67 |
+ |
|
68 |
+static const struct rtf_state base_state = { |
|
69 |
+ 0,0,0,PARSE_MAIN,0,0," ",NULL,NULL,NULL,NULL |
|
70 |
+}; |
|
71 |
+ |
|
72 |
/* Stack of saved parser states, one entry per non-default open group. */
struct stack {
	struct rtf_state* states;	/* heap array holding the saved states */
	size_t elements;		/* groups currently open, default ones included */
	size_t stack_cnt;		/* entries of states[] in use */
	size_t stack_size;		/* entries allocated in states[] */
};
|
78 |
+ |
|
79 |
+static const struct rtf_action_mapping { |
|
80 |
+ const char* controlword; |
|
81 |
+ const enum rtf_action action; |
|
82 |
+} rtf_action_mapping [] = |
|
83 |
+{ |
|
84 |
+ {"object", RTF_OBJECT}, |
|
85 |
+ {"objdata ",RTF_OBJECT_DATA} |
|
86 |
+}; |
|
87 |
+ |
|
88 |
+static const size_t rtf_action_mapping_cnt = sizeof(rtf_action_mapping)/sizeof(rtf_action_mapping[0]); |
|
89 |
+ |
|
90 |
+enum rtf_objdata_state {WAIT_MAGIC, WAIT_DESC_LEN, WAIT_DESC, WAIT_ZERO, WAIT_DATA_SIZE, DUMP_DATA, DUMP_DISCARD}; |
|
91 |
+static const unsigned char rtf_data_magic[] = {0x01, 0x05, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00};/* is this a magic number, or does it mean something */ |
|
92 |
+static const size_t rtf_data_magic_len = sizeof(rtf_data_magic); |
|
93 |
+ |
|
94 |
+struct rtf_object_data { |
|
95 |
+ char* name; |
|
96 |
+ int fd; |
|
97 |
+ int partial; |
|
98 |
+ int has_partial; |
|
99 |
+ enum rtf_objdata_state internal_state; |
|
100 |
+ char* desc_name; |
|
101 |
+ const char* tmpdir; |
|
102 |
+ cli_ctx* ctx; |
|
103 |
+ size_t desc_len; |
|
104 |
+ size_t bread; |
|
105 |
+}; |
|
106 |
+ |
|
107 |
/* Size of the read buffer and of the decoded-data scratch buffer. */
#define BUFF_SIZE 8192

/*
 * Maps a character to the value of the hex digit it represents; every
 * character that is not a hex digit maps to 0.
 * generated by contrib/phishing/generate_tables.c
 */
static const short int hextable[256] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0, 0, 0, 0, 0, 0,
	/* 0x40 */ 0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Defined elsewhere; when non-zero, temporary files are left on disk. */
extern int short cli_leavetemps_flag;
|
129 |
+ |
|
130 |
+static void init_rtf_state(struct rtf_state* state) |
|
131 |
+{ |
|
132 |
+ *state = base_state; |
|
133 |
+ state->parse_state = PARSE_MAIN; |
|
134 |
+ state->controlword_cnt = 0; |
|
135 |
+} |
|
136 |
+ |
|
137 |
+static int compare_state(const struct rtf_state* a,const struct rtf_state* b) |
|
138 |
+{ |
|
139 |
+ return (a->controlword_param == b->controlword_param && |
|
140 |
+ a->parse_state == b->parse_state && |
|
141 |
+ a->encounteredTopLevel == b->encounteredTopLevel && |
|
142 |
+ memcmp(a->controlword,b->controlword,33)==0 && |
|
143 |
+ a->cb_begin == b->cb_begin && |
|
144 |
+ a->cb_process == b->cb_process && |
|
145 |
+ a->cb_end == b->cb_end && |
|
146 |
+ a->cb_data == b->cb_data); |
|
147 |
+} |
|
148 |
+ |
|
149 |
+ |
|
150 |
+static int push_state(struct stack* stack,struct rtf_state* state) |
|
151 |
+{ |
|
152 |
+ int toplevel; |
|
153 |
+ size_t defelements; |
|
154 |
+ |
|
155 |
+ stack->elements++; |
|
156 |
+ if( compare_state(state,&base_state)) { |
|
157 |
+ state->default_elements++; |
|
158 |
+ return 0;/* this is default state, don't push it, we'll know when we pop it that it was the default one, |
|
159 |
+ we store in the state how many default elements we have on the stack */ |
|
160 |
+ } |
|
161 |
+ if(stack->stack_cnt >= stack->stack_size) { |
|
162 |
+ /* grow stack */ |
|
163 |
+ stack->stack_size += 128; |
|
164 |
+ stack->states = cli_realloc(stack->states, stack->stack_size*sizeof(*stack->states)); |
|
165 |
+ if(!stack->states) |
|
166 |
+ return CL_EMEM; |
|
167 |
+ } |
|
168 |
+ stack->states[stack->stack_cnt++] = *state; |
|
169 |
+ toplevel = state->encounteredTopLevel; |
|
170 |
+ defelements = state->default_elements; |
|
171 |
+ |
|
172 |
+ *state = base_state; |
|
173 |
+ |
|
174 |
+ state->encounteredTopLevel = toplevel; |
|
175 |
+ state->default_elements = defelements; |
|
176 |
+ return 0; |
|
177 |
+} |
|
178 |
+ |
|
179 |
+ |
|
180 |
+static int pop_state(struct stack* stack,struct rtf_state* state) |
|
181 |
+{ |
|
182 |
+ stack->elements--; |
|
183 |
+ if(state->default_elements) { |
|
184 |
+ const size_t default_elements = state->default_elements-1; |
|
185 |
+ const int toplevel = state->encounteredTopLevel; |
|
186 |
+ *state = base_state; |
|
187 |
+ state->default_elements = default_elements; |
|
188 |
+ state->encounteredTopLevel = toplevel; |
|
189 |
+ return 0;/* this is a default 'state'*/ |
|
190 |
+ } |
|
191 |
+ if(!stack->stack_cnt) { |
|
192 |
+ cli_dbgmsg("Warning: attempt to pop from empty stack!\n"); |
|
193 |
+ *state = base_state;/* lets assume we give it a base state */ |
|
194 |
+ return 0; |
|
195 |
+ } |
|
196 |
+ *state = stack->states[--stack->stack_cnt]; |
|
197 |
+ return 0; |
|
198 |
+} |
|
199 |
+ |
|
200 |
+ |
|
201 |
+static int load_actions(table_t* t) |
|
202 |
+{ |
|
203 |
+ size_t i; |
|
204 |
+ for(i=0; i<rtf_action_mapping_cnt; i++) |
|
205 |
+ tableInsert(t, rtf_action_mapping[i].controlword, rtf_action_mapping[i].action); |
|
206 |
+ return 0; |
|
207 |
+} |
|
208 |
+ |
|
209 |
+static int rtf_object_begin(struct rtf_state* state,cli_ctx* ctx,const char* tmpdir) |
|
210 |
+{ |
|
211 |
+ struct rtf_object_data* data = cli_malloc(sizeof(*data)); |
|
212 |
+ if(!data) |
|
213 |
+ return CL_EMEM; |
|
214 |
+ data->fd = -1; |
|
215 |
+ data->partial = 0; |
|
216 |
+ data->has_partial = 0; |
|
217 |
+ data->bread = 0; |
|
218 |
+ data->internal_state = WAIT_MAGIC; |
|
219 |
+ data->tmpdir = tmpdir; |
|
220 |
+ data->ctx = ctx; |
|
221 |
+ |
|
222 |
+ state->cb_data = data; |
|
223 |
+ return 0; |
|
224 |
+} |
|
225 |
+ |
|
226 |
+ |
|
227 |
+static int decode_and_scan(struct rtf_object_data* data, cli_ctx* ctx) |
|
228 |
+{ |
|
229 |
+ int ofd, ret=0; |
|
230 |
+ |
|
231 |
+ cli_dbgmsg("Scanning embedded object:%s\n",data->name); |
|
232 |
+ if(data->bread == 1) { |
|
233 |
+ cli_dbgmsg("Decoding ole object\n"); |
|
234 |
+ lseek(data->fd,0,SEEK_SET); |
|
235 |
+ ofd = cli_decode_ole_object(data->fd,data->tmpdir); |
|
236 |
+ if (ofd >= 0) { |
|
237 |
+ ret = cli_magic_scandesc(ofd, ctx); |
|
238 |
+ close(ofd); |
|
239 |
+ } |
|
240 |
+ } |
|
241 |
+ else |
|
242 |
+ ret = cli_magic_scandesc(data->fd,ctx); |
|
243 |
+ close(data->fd); |
|
244 |
+ data->fd = 0; |
|
245 |
+ if(data->name) { |
|
246 |
+ if(!cli_leavetemps_flag) |
|
247 |
+ unlink(data->name); |
|
248 |
+ free(data->name); |
|
249 |
+ data->name = NULL; |
|
250 |
+ } |
|
251 |
+ |
|
252 |
+ if(ret != CL_CLEAN) |
|
253 |
+ return ret; |
|
254 |
+ return 0; |
|
255 |
+} |
|
256 |
+ |
|
257 |
+static int rtf_object_process(struct rtf_state* state, const unsigned char* input,const size_t len) |
|
258 |
+{ |
|
259 |
+ struct rtf_object_data* data = state->cb_data; |
|
260 |
+ unsigned char outdata[BUFF_SIZE]; |
|
261 |
+ const unsigned char* out_data; |
|
262 |
+ size_t out_cnt = 0; |
|
263 |
+ size_t i; |
|
264 |
+ |
|
265 |
+ if(!data || !len) |
|
266 |
+ return 0; |
|
267 |
+ |
|
268 |
+ if(data->has_partial) { |
|
269 |
+ outdata[out_cnt++] = data->partial | input[0]; |
|
270 |
+ } |
|
271 |
+ |
|
272 |
+ data->has_partial = 0; |
|
273 |
+ for(i=0;i<len;i++) { |
|
274 |
+ if(isxdigit(input[i])) { |
|
275 |
+ const unsigned char byte = hextable[ input[i++] ] << 4; |
|
276 |
+ while(i<len && !isxdigit(input[i])) |
|
277 |
+ i++; |
|
278 |
+ if(i == len) { |
|
279 |
+ data->partial = byte; |
|
280 |
+ data->has_partial = 1; |
|
281 |
+ break; |
|
282 |
+ } |
|
283 |
+ outdata[out_cnt++] = byte | hextable[ input[i] ]; |
|
284 |
+ } |
|
285 |
+ } |
|
286 |
+ |
|
287 |
+ out_data = outdata; |
|
288 |
+ while(out_data && out_cnt) { |
|
289 |
+ switch(data->internal_state) { |
|
290 |
+ case WAIT_MAGIC: { |
|
291 |
+ for(i=0; i<out_cnt && data->bread < rtf_data_magic_len; i++, data->bread++) |
|
292 |
+ if(rtf_data_magic[data->bread] != out_data[i]) { |
|
293 |
+ cli_dbgmsg("Warning: rtf objdata magic number not matched, expected:%d, got: %d, at pos:%d\n",rtf_data_magic[i],out_data[i],data->bread); |
|
294 |
+ } |
|
295 |
+ out_cnt -= i; |
|
296 |
+ if(data->bread == rtf_data_magic_len) { |
|
297 |
+ out_data += i; |
|
298 |
+ data->bread = 0; |
|
299 |
+ data->internal_state = WAIT_DESC_LEN; |
|
300 |
+ } |
|
301 |
+ break; |
|
302 |
+ } |
|
303 |
+ case WAIT_DESC_LEN: { |
|
304 |
+ if(data->bread == 0) |
|
305 |
+ data->desc_len = 0; |
|
306 |
+ for(i=0; i<out_cnt && data->bread < 4; i++,data->bread++) |
|
307 |
+ data->desc_len |= ((size_t)out_data[i]) << (data->bread*8); |
|
308 |
+ out_cnt -= i; |
|
309 |
+ if(data->bread == 4) { |
|
310 |
+ out_data += i; |
|
311 |
+ data->bread=0; |
|
312 |
+ if(data->desc_len > 64) { |
|
313 |
+ cli_dbgmsg("Description length too big (%d), showing only 64 bytes of it\n",data->desc_len); |
|
314 |
+ data->desc_name = cli_malloc(65); |
|
315 |
+ } |
|
316 |
+ else |
|
317 |
+ data->desc_name = cli_malloc(data->desc_len+1); |
|
318 |
+ if(!data->desc_name) { |
|
319 |
+ return CL_EMEM; |
|
320 |
+ } |
|
321 |
+ data->internal_state = WAIT_DESC; |
|
322 |
+ } |
|
323 |
+ break; |
|
324 |
+ } |
|
325 |
+ case WAIT_DESC:{ |
|
326 |
+ for(i=0;i<out_cnt && data->bread < data->desc_len && data->bread < 64;i++, data->bread++) |
|
327 |
+ data->desc_name[data->bread] = out_data[i]; |
|
328 |
+ /*FIXME: sanity check here, to avoid segfault */ |
|
329 |
+ if(i+data->desc_len-data->bread > out_cnt) { |
|
330 |
+ cli_dbgmsg("Can't interpret length in wait_desc\n"); |
|
331 |
+ return 0;/* bail out */ |
|
332 |
+ } |
|
333 |
+ out_cnt -= i + data->desc_len - data->bread; |
|
334 |
+ if(data->bread <= data->desc_len) { |
|
335 |
+ out_data += i + data->desc_len - data->bread; |
|
336 |
+ data->desc_name[data->bread] = '\0'; |
|
337 |
+ data->bread = 0; |
|
338 |
+ cli_dbgmsg("Preparing to dump rtf embedded object, description:%s\n",data->desc_name); |
|
339 |
+ free(data->desc_name); |
|
340 |
+ data->desc_name = NULL; |
|
341 |
+ data->internal_state = WAIT_ZERO; |
|
342 |
+ } |
|
343 |
+ break; |
|
344 |
+ } |
|
345 |
+ case WAIT_ZERO:{ |
|
346 |
+ if(out_cnt < 8-data->bread) { |
|
347 |
+ out_cnt = 0; |
|
348 |
+ data->bread += out_cnt; |
|
349 |
+ } |
|
350 |
+ else { |
|
351 |
+ out_cnt -= 8-data->bread; |
|
352 |
+ data->bread = 8; |
|
353 |
+ } |
|
354 |
+ if(data->bread == 8) { |
|
355 |
+ out_data += 8; |
|
356 |
+ data->bread = 0; |
|
357 |
+ data->internal_state = WAIT_DATA_SIZE; |
|
358 |
+ } |
|
359 |
+ break; |
|
360 |
+ } |
|
361 |
+ |
|
362 |
+ case WAIT_DATA_SIZE: { |
|
363 |
+ if(data->bread == 0) |
|
364 |
+ data->desc_len = 0; |
|
365 |
+ for(i=0; i<out_cnt && data->bread < 4; i++,data->bread++) |
|
366 |
+ data->desc_len |= ((size_t)out_data[i]) << (8*data->bread); |
|
367 |
+ out_cnt -= i; |
|
368 |
+ if(data->bread == 4) { |
|
369 |
+ out_data += i; |
|
370 |
+ data->bread=0; |
|
371 |
+ cli_dbgmsg("Dumping rtf embedded object of size:%ld\n",data->desc_len); |
|
372 |
+ data->name = cli_gentempdesc(data->tmpdir, &data->fd); |
|
373 |
+ if(!data->name) |
|
374 |
+ return CL_ETMPFILE; |
|
375 |
+ data->internal_state = DUMP_DATA; |
|
376 |
+ } |
|
377 |
+ break; |
|
378 |
+ } |
|
379 |
+ case DUMP_DATA: { |
|
380 |
+ ssize_t out_want = out_cnt < data->desc_len ? out_cnt : data->desc_len; |
|
381 |
+ if(!data->bread) { |
|
382 |
+ if(out_data[0] != 0xd0 || out_data[1]!=0xcf) { |
|
383 |
+ /* this is not an ole2 doc, but some ole (stream?) to be |
|
384 |
+ * decoded by cli_decode_ole_object*/ |
|
385 |
+ char out[4]; |
|
386 |
+ data->bread = 1;/* flag to indicate this needs to be scanned with cli_decode_ole_object*/ |
|
387 |
+ cli_writeint32(out,data->desc_len); |
|
388 |
+ if(cli_writen(data->fd,out,4)!=4) |
|
389 |
+ return CL_EIO; |
|
390 |
+ } |
|
391 |
+ else |
|
392 |
+ data->bread = 2; |
|
393 |
+ } |
|
394 |
+ |
|
395 |
+ data->desc_len -= out_want; |
|
396 |
+ if(cli_writen(data->fd,out_data,out_want) != out_want) { |
|
397 |
+ return CL_EIO; |
|
398 |
+ } |
|
399 |
+ out_data += out_want; |
|
400 |
+ out_cnt -= out_want; |
|
401 |
+ if(!data->desc_len) { |
|
402 |
+ int rc; |
|
403 |
+ if(( rc = decode_and_scan(data, data->ctx) )) |
|
404 |
+ return rc; |
|
405 |
+ data->bread=0; |
|
406 |
+ data->internal_state = WAIT_MAGIC; |
|
407 |
+ } |
|
408 |
+ break; |
|
409 |
+ } |
|
410 |
+ case DUMP_DISCARD: |
|
411 |
+ default: |
|
412 |
+ out_cnt = 0; |
|
413 |
+ ; |
|
414 |
+ } |
|
415 |
+ } |
|
416 |
+ return 0; |
|
417 |
+} |
|
418 |
+ |
|
419 |
+ |
|
420 |
+ |
|
421 |
+static int rtf_object_end(struct rtf_state* state,cli_ctx* ctx) |
|
422 |
+{ |
|
423 |
+ struct rtf_object_data* data = state->cb_data; |
|
424 |
+ int rc = 0; |
|
425 |
+ if(!data) |
|
426 |
+ return 0; |
|
427 |
+ if(data->fd) { |
|
428 |
+ rc = decode_and_scan(data, ctx); |
|
429 |
+ } |
|
430 |
+ if(data->name) |
|
431 |
+ free(data->name); |
|
432 |
+ if(data->desc_name) |
|
433 |
+ free(data->desc_name); |
|
434 |
+ free(data); |
|
435 |
+ state->cb_data = NULL; |
|
436 |
+ return rc; |
|
437 |
+} |
|
438 |
+ |
|
439 |
+ |
|
440 |
+static void rtf_action(struct rtf_state* state,long action, const char* tempname,cli_ctx* ctx) |
|
441 |
+{ |
|
442 |
+ switch(action) { |
|
443 |
+ case RTF_OBJECT: |
|
444 |
+ state->encounteredTopLevel |= 1<<RTF_OBJECT; |
|
445 |
+ break; |
|
446 |
+ case RTF_OBJECT_DATA: |
|
447 |
+ if(state->encounteredTopLevel & (1<<RTF_OBJECT) ) { |
|
448 |
+ state->cb_begin = rtf_object_begin; |
|
449 |
+ state->cb_process = rtf_object_process; |
|
450 |
+ state->cb_end = rtf_object_end; |
|
451 |
+ } |
|
452 |
+ break; |
|
453 |
+ }; |
|
454 |
+} |
|
455 |
+ |
|
456 |
+static void cleanup_stack(struct stack* stack,struct rtf_state* state,cli_ctx* ctx) |
|
457 |
+{ |
|
458 |
+ while(stack && stack->stack_cnt && state->default_elements) { |
|
459 |
+ pop_state(stack,state); |
|
460 |
+ if(state->cb_begin) |
|
461 |
+ state->cb_end(state,ctx); |
|
462 |
+ } |
|
463 |
+} |
|
464 |
+ |
|
465 |
+ |
|
466 |
/*
 * Release everything cli_scanrtf() owns: the action table, pending callback
 * data, the read buffer, the temporary directory (unless
 * cli_leavetemps_flag is set) and the state stack.  Expects the locals
 * actiontable, stack, state, ctx, buff and tempname to be in scope.
 * Wrapped in do/while(0) so that "SCAN_CLEANUP;" is a single statement.
 */
#define SCAN_CLEANUP \
	do {\
		tableDestroy(actiontable);\
		cleanup_stack(&stack,&state,ctx);\
		free(buff);\
		if(!cli_leavetemps_flag)\
			cli_rmdirs(tempname);\
		free(tempname);\
		free(stack.states);\
	} while(0)
|
474 |
+ |
|
475 |
+int cli_scanrtf(int desc, cli_ctx *ctx) |
|
476 |
+{ |
|
477 |
+ char* tempname; |
|
478 |
+ const unsigned char* ptr; |
|
479 |
+ const unsigned char* ptr_end; |
|
480 |
+ unsigned char* buff; |
|
481 |
+ int ret = CL_CLEAN; |
|
482 |
+ struct rtf_state state; |
|
483 |
+ struct stack stack; |
|
484 |
+ size_t bread; |
|
485 |
+ table_t* actiontable; |
|
486 |
+ uint8_t main_symbols[256]; |
|
487 |
+ |
|
488 |
+ cli_dbgmsg("in cli_scanrtf()\n"); |
|
489 |
+ |
|
490 |
+ memset(main_symbols, 0, 256); |
|
491 |
+ main_symbols['{']=1; |
|
492 |
+ main_symbols['}']=1; |
|
493 |
+ main_symbols['\\']=1; |
|
494 |
+ |
|
495 |
+ stack.stack_cnt = 0; |
|
496 |
+ stack.stack_size = 16; |
|
497 |
+ stack.elements = 0; |
|
498 |
+ stack.states = cli_malloc(stack.stack_size*sizeof(*stack.states)); |
|
499 |
+ |
|
500 |
+ if(!stack.states) |
|
501 |
+ return CL_EMEM; |
|
502 |
+ |
|
503 |
+ buff = cli_malloc(BUFF_SIZE); |
|
504 |
+ if(!buff) { |
|
505 |
+ free(stack.states); |
|
506 |
+ return CL_EMEM; |
|
507 |
+ } |
|
508 |
+ |
|
509 |
+ tempname = cli_gentemp(NULL); |
|
510 |
+ |
|
511 |
+ if(mkdir(tempname, 0700)) { |
|
512 |
+ cli_dbgmsg("ScanRTF -> Can't create temporary directory %s\n", tempname); |
|
513 |
+ free(stack.states); |
|
514 |
+ free(buff); |
|
515 |
+ free(tempname); |
|
516 |
+ return CL_ETMPDIR; |
|
517 |
+ } |
|
518 |
+ |
|
519 |
+ actiontable = tableCreate(); |
|
520 |
+ if((ret = load_actions(actiontable))) { |
|
521 |
+ cli_dbgmsg("RTF: Unable to load rtf action table\n"); |
|
522 |
+ free(stack.states); |
|
523 |
+ free(buff); |
|
524 |
+ if(!cli_leavetemps_flag) |
|
525 |
+ cli_rmdirs(tempname); |
|
526 |
+ free(tempname); |
|
527 |
+ return ret; |
|
528 |
+ } |
|
529 |
+ |
|
530 |
+ init_rtf_state(&state); |
|
531 |
+ |
|
532 |
+ while(( bread = cli_readn(desc, buff, BUFF_SIZE) )) { |
|
533 |
+ ptr = buff; |
|
534 |
+ ptr_end = buff + bread; |
|
535 |
+ while(ptr < ptr_end) { |
|
536 |
+ switch(state.parse_state) { |
|
537 |
+ case PARSE_MAIN: |
|
538 |
+ switch(*ptr++) { |
|
539 |
+ case '{': |
|
540 |
+ if(( ret = push_state(&stack,&state) )) { |
|
541 |
+ cli_dbgmsg("RTF:Push failure!\n"); |
|
542 |
+ SCAN_CLEANUP; |
|
543 |
+ return ret; |
|
544 |
+ } |
|
545 |
+ break; |
|
546 |
+ case '}': |
|
547 |
+ if(state.cb_data) |
|
548 |
+ if(( ret = state.cb_end(&state, ctx) )) { |
|
549 |
+ SCAN_CLEANUP; |
|
550 |
+ return ret; |
|
551 |
+ } |
|
552 |
+ if(( ret = pop_state(&stack,&state) )) { |
|
553 |
+ cli_dbgmsg("RTF:pop failure!\n"); |
|
554 |
+ SCAN_CLEANUP; |
|
555 |
+ return ret; |
|
556 |
+ } |
|
557 |
+ break; |
|
558 |
+ case '\\': |
|
559 |
+ state.parse_state = PARSE_CONTROL_; |
|
560 |
+ break; |
|
561 |
+ default: |
|
562 |
+ ptr--; |
|
563 |
+ { |
|
564 |
+ size_t i; |
|
565 |
+ size_t left = ptr_end - ptr; |
|
566 |
+ size_t use = left; |
|
567 |
+ for(i = 1;i < left; i++) |
|
568 |
+ if(main_symbols[ptr[i]]) { |
|
569 |
+ use = i; |
|
570 |
+ break; |
|
571 |
+ } |
|
572 |
+ if(state.cb_begin) { |
|
573 |
+ if(!state.cb_data) |
|
574 |
+ if(( ret = state.cb_begin(&state, ctx,tempname) )) { |
|
575 |
+ SCAN_CLEANUP; |
|
576 |
+ return ret; |
|
577 |
+ } |
|
578 |
+ if(( ret = state.cb_process(&state, ptr, use) )) { |
|
579 |
+ state.cb_end(&state,ctx); |
|
580 |
+ SCAN_CLEANUP; |
|
581 |
+ return ret; |
|
582 |
+ } |
|
583 |
+ } |
|
584 |
+ ptr += use; |
|
585 |
+ } |
|
586 |
+ } |
|
587 |
+ break; |
|
588 |
+ case PARSE_CONTROL_: |
|
589 |
+ if(isalpha(*ptr)) { |
|
590 |
+ state.parse_state = PARSE_CONTROL_WORD; |
|
591 |
+ state.controlword_cnt = 0; |
|
592 |
+ } |
|
593 |
+ else |
|
594 |
+ state.parse_state = PARSE_CONTROL_SYMBOL; |
|
595 |
+ break; |
|
596 |
+ case PARSE_CONTROL_SYMBOL: |
|
597 |
+ ptr++; /* Do nothing */ |
|
598 |
+ state.parse_state = PARSE_MAIN; |
|
599 |
+ break; |
|
600 |
+ case PARSE_CONTROL_WORD: |
|
601 |
+ if(state.controlword_cnt == 32) { |
|
602 |
+ cli_dbgmsg("Invalid control word: maximum size exceeded:%s\n",state.controlword); |
|
603 |
+ state.parse_state = PARSE_MAIN; |
|
604 |
+ } |
|
605 |
+ else if(isalpha(*ptr)) |
|
606 |
+ state.controlword[state.controlword_cnt++] = *ptr++; |
|
607 |
+ else { |
|
608 |
+ if(isspace(*ptr)) { |
|
609 |
+ state.controlword[state.controlword_cnt++] = *ptr++; |
|
610 |
+ state.parse_state = PARSE_INTERPRET_CONTROLWORD; |
|
611 |
+ } |
|
612 |
+ else if (isdigit(*ptr)) { |
|
613 |
+ state.parse_state = PARSE_CONTROL_WORD_PARAM; |
|
614 |
+ state.controlword_param = 0; |
|
615 |
+ state.controlword_param_sign = 1; |
|
616 |
+ } |
|
617 |
+ else if(*ptr == '-') { |
|
618 |
+ ptr++; |
|
619 |
+ state.parse_state = PARSE_CONTROL_WORD_PARAM; |
|
620 |
+ state.controlword_param = 0; |
|
621 |
+ state.controlword_param_sign = -1; |
|
622 |
+ } |
|
623 |
+ else { |
|
624 |
+ state.parse_state = PARSE_INTERPRET_CONTROLWORD; |
|
625 |
+ } |
|
626 |
+ } |
|
627 |
+ break; |
|
628 |
+ case PARSE_CONTROL_WORD_PARAM: |
|
629 |
+ if(isdigit(*ptr)) { |
|
630 |
+ state.controlword_param = state.controlword_param*10 + *ptr++ - '0'; |
|
631 |
+ } |
|
632 |
+ else if(isalpha(*ptr)) { |
|
633 |
+ ptr++; |
|
634 |
+ } |
|
635 |
+ else { |
|
636 |
+ if(state.controlword_param_sign < 0) |
|
637 |
+ state.controlword_param = -state.controlword_param; |
|
638 |
+ state.parse_state = PARSE_INTERPRET_CONTROLWORD; |
|
639 |
+ } |
|
640 |
+ break; |
|
641 |
+ case PARSE_INTERPRET_CONTROLWORD: |
|
642 |
+ { |
|
643 |
+ int action; |
|
644 |
+ |
|
645 |
+ state.controlword[state.controlword_cnt] = '\0'; |
|
646 |
+ action = tableFind(actiontable, state.controlword); |
|
647 |
+ if(action != -1) |
|
648 |
+ rtf_action(&state,action, tempname, ctx); |
|
649 |
+ state.parse_state = PARSE_MAIN; |
|
650 |
+ break; |
|
651 |
+ } |
|
652 |
+ } |
|
653 |
+ } |
|
654 |
+ } |
|
655 |
+ |
|
656 |
+ SCAN_CLEANUP; |
|
657 |
+ return ret; |
|
658 |
+} |
|
659 |
+ |
|
660 |
+#endif |
... | ... |
@@ -80,6 +80,10 @@ extern short cli_leavetemps_flag; |
80 | 80 |
#include "pdf.h" |
81 | 81 |
#include "str.h" |
82 | 82 |
|
83 |
+#ifdef CL_EXPERIMENTAL |
|
84 |
+#include "rtf.h" |
|
85 |
+#endif |
|
86 |
+ |
|
83 | 87 |
#ifdef HAVE_ZLIB_H |
84 | 88 |
#include <zlib.h> |
85 | 89 |
#include "unzip.h" |
... | ... |
@@ -1775,6 +1779,12 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx) |
1775 | 1775 |
ret = cli_scanhtml_utf16(desc, ctx); |
1776 | 1776 |
break; |
1777 | 1777 |
|
1778 |
+#ifdef CL_EXPERIMENTAL |
|
1779 |
+ case CL_TYPE_RTF: |
|
1780 |
+ ret = cli_scanrtf(desc, ctx); |
|
1781 |
+ break; |
|
1782 |
+#endif |
|
1783 |
+ |
|
1778 | 1784 |
case CL_TYPE_MAIL: |
1779 | 1785 |
if(SCAN_MAIL) |
1780 | 1786 |
ret = cli_scanmail(desc, ctx); |