Browse code

xmlreader unterminated char entity pre-processing

Kevin Lin authored on 2015/04/01 02:30:28
Showing 1 changed file
... ...
@@ -78,7 +78,18 @@ static const struct key_entry msxml_keys[] = {
78 78
 };
79 79
 static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(struct key_entry);
80 80
 
81
+enum msxml_state { /* scanner states for spotting numeric character entities ("&#..." / "&#x...") while copying; kept in struct msxml_cbdata */
82
+    MSXML_STATE_NORMAL = 0, /* not inside a candidate entity; a '&' moves to ENTITY_START_1 */
83
+    MSXML_STATE_ENTITY_START_1, /* previous byte was '&'; a '#' moves to ENTITY_START_2, anything else returns to NORMAL */
84
+    MSXML_STATE_ENTITY_START_2, /* saw "&#"; 'x' selects ENTITY_HEX, a decimal digit selects ENTITY_DEC, anything else returns to NORMAL */
85
+    MSXML_STATE_ENTITY_HEX, /* consuming [0-9a-fA-F]; the first other byte moves to ENTITY_CLOSE */
86
+    MSXML_STATE_ENTITY_DEC, /* consuming [0-9]; the first other byte moves to ENTITY_CLOSE */
87
+    MSXML_STATE_ENTITY_CLOSE, /* transient: if the current byte is not ';' one is inserted into the output, then the state resets to NORMAL */
88
+    MSXML_STATE_ENTITY_NONE /* NOTE(review): not referenced by the visible state machine; purpose unclear from this hunk */
89
+};
90
+
81 91
 struct msxml_cbdata {
92
+    enum msxml_state state;
82 93
     fmap_t *map;
83 94
     const unsigned char *window;
84 95
     off_t winpos, mappos;
... ...
@@ -145,8 +156,14 @@ int msxml_read_cb(void *ctx, char *buffer, int len)
145 145
     wbytes = 0;
146 146
     rbytes = cbdata->winsize - cbdata->winpos;
147 147
 
148
+    /* copying loop with preprocessing */
148 149
     while (wbytes < len) {
149
-        size_t written = MIN(rbytes, len);
150
+        const unsigned char *read_from;
151
+        char *write_to = buffer + wbytes;
152
+        enum msxml_state *state;
153
+#if MSXML_VERBIOSE
154
+        size_t written;
155
+#endif
150 156
 
151 157
         if (!rbytes) {
152 158
             if ((winret = msxml_read_cb_new_window(cbdata)) < 0)
... ...
@@ -159,16 +176,68 @@ int msxml_read_cb(void *ctx, char *buffer, int len)
159 159
             rbytes = cbdata->winsize;
160 160
         }
161 161
 
162
+#if MSXML_VERBIOSE
162 163
         written = MIN(rbytes, len - wbytes);
163
-
164
-        cli_msxmlmsg("msxml_read_cb: copying from window [%llu(+%llu)] %llu->%llu\n",
164
+        cli_msxmlmsg("msxml_read_cb: copying from window [%llu(+%llu)] %llu->~%llu\n",
165 165
                      (long long unsigned)(cbdata->winsize - rbytes), (long long unsigned)cbdata->winsize,
166 166
                      (long long unsigned)cbdata->winpos, (long long unsigned)(cbdata->winpos + written));
167
+#endif
167 168
 
168
-        memcpy(buffer + wbytes, cbdata->window + cbdata->winpos, written);
169
+        read_from = cbdata->window + cbdata->winpos;
170
+        state = &(cbdata->state);
171
+
172
+        while (rbytes > 0 && wbytes < len) {
173
+            switch (*state) {
174
+            case MSXML_STATE_NORMAL:
175
+                if ((*read_from) == '&')
176
+                    *state = MSXML_STATE_ENTITY_START_1;
177
+                break;
178
+            case MSXML_STATE_ENTITY_START_1:
179
+                if ((*read_from) == '#')
180
+                    *state = MSXML_STATE_ENTITY_START_2;
181
+                else
182
+                    *state = MSXML_STATE_NORMAL;
183
+                break;
184
+            case MSXML_STATE_ENTITY_START_2:
185
+                if ((*read_from) == 'x')
186
+                    *state = MSXML_STATE_ENTITY_HEX;
187
+                else if (((*read_from) >= '0') && ((*read_from) <= '9'))
188
+                    *state = MSXML_STATE_ENTITY_DEC;
189
+                else
190
+                    *state = MSXML_STATE_NORMAL;
191
+                break;
192
+            case MSXML_STATE_ENTITY_HEX:
193
+                if ((((*read_from) >= '0') && ((*read_from) <= '9')) ||
194
+                    (((*read_from) >= 'a') && ((*read_from) <= 'f')) ||
195
+                    (((*read_from) >= 'A') && ((*read_from) <= 'F'))) {}
196
+                else
197
+                    *state = MSXML_STATE_ENTITY_CLOSE;
198
+                break;
199
+            case MSXML_STATE_ENTITY_DEC:
200
+                if (((*read_from) >= '0') && ((*read_from) <= '9')) {}
201
+                else
202
+                    *state = MSXML_STATE_ENTITY_CLOSE;
203
+                break;
204
+            default:
205
+                cli_errmsg("unknown *state: %d\n", *state);
206
+            }
169 207
 
170
-        wbytes += written;
171
-        rbytes -= written;
208
+            if (*state == MSXML_STATE_ENTITY_CLOSE) {
209
+                if ((*read_from) != ';') {
210
+                    cli_msxmlmsg("msxml_read_cb: detected unterminated character entity @ winoff %d\n",
211
+                                 (int)(read_from - cbdata->window));
212
+                    (*write_to++) = ';';
213
+                    wbytes++;
214
+                }
215
+                *state = MSXML_STATE_NORMAL;
216
+                if (wbytes >= len)
217
+                    break;
218
+            }
219
+
220
+            *(write_to++) = *(read_from++);
221
+            rbytes--;
222
+            wbytes++;
223
+        }
172 224
     }
173 225
 
174 226
     cbdata->winpos = cbdata->winsize - rbytes;