Browse code

ffprobe: implement string validation setting

This should fix trac tickets #1163, #2502.

Stefano Sabatini authored on 2013/10/02 23:22:17
Showing 3 changed files
... ...
@@ -7,6 +7,7 @@ version <next>
7 7
 - Live HDS muxer
8 8
 - setsar/setdar filters now support variables in ratio expressions
9 9
 - elbg filter
10
+- string validation in ffprobe
10 11
 
11 12
 
12 13
 version 2.1:
... ...
@@ -337,6 +337,39 @@ A writer may accept one or more arguments, which specify the options
337 337
 to adopt. The options are specified as a list of @var{key}=@var{value}
338 338
 pairs, separated by ":".
339 339
 
340
+All writers support the following options:
341
+
342
+@table @option
343
+@item string_validation, sv
344
+Set string validation mode.
345
+
346
+The following values are accepted.
347
+@table @samp
348
+@item fail
349
+The writer will fail immediately in case an invalid UTF-8
350
+sequence or code point is found in the input. This is especially
351
+useful to validate input metadata.
352
+
353
+@item ignore
354
+Any validation error will be ignored. This will result in possibly
355
+broken output, especially with the json or xml writer.
356
+
357
+@item replace
358
+The writer will substitute invalid UTF-8 sequences or code points with
359
+the string specified with the @option{string_validation_replacement} option.
360
+@end table
361
+
362
+Default value is @samp{replace}.
363
+
364
+@item string_validation_replacement, svr
365
+Set replacement string to use in case @option{string_validation} is
366
+set to @samp{replace}.
367
+
368
+In case the option is not specified, the writer will assume the empty
369
+string, that is, it will remove the invalid sequences from the input
370
+strings.
371
+@end table
372
+
340 373
 A description of the currently available writers follows.
341 374
 
342 375
 @section default
... ...
@@ -258,6 +258,13 @@ typedef struct WriterContext WriterContext;
258 258
 #define WRITER_FLAG_DISPLAY_OPTIONAL_FIELDS 1
259 259
 #define WRITER_FLAG_PUT_PACKETS_AND_FRAMES_IN_SAME_CHAPTER 2
260 260
 
261
/**
 * Policy applied by a writer when a string it has to print contains an
 * invalid UTF-8 sequence or code point.
 *
 * The numeric values are also the range accepted by the "string_validation"
 * writer option (0 .. WRITER_STRING_VALIDATION_NB-1).
 */
typedef enum {
    WRITER_STRING_VALIDATION_FAIL    = 0, ///< stop with an error on the first invalid sequence
    WRITER_STRING_VALIDATION_REPLACE = 1, ///< substitute each invalid sequence with the replacement string
    WRITER_STRING_VALIDATION_IGNORE  = 2, ///< pass invalid sequences through unchanged
    WRITER_STRING_VALIDATION_NB      = 3, ///< number of validation modes, not a mode itself
} StringValidation;
267
+
261 268
 typedef struct Writer {
262 269
     const AVClass *priv_class;      ///< private class of the writer, if any
263 270
     int priv_size;                  ///< private size for the writer context
... ...
@@ -298,6 +305,10 @@ struct WriterContext {
298 298
     unsigned int nb_section_packet; ///< number of the packet section in case we are in "packets_and_frames" section
299 299
     unsigned int nb_section_frame;  ///< number of the frame  section in case we are in "packets_and_frames" section
300 300
     unsigned int nb_section_packet_frame; ///< nb_section_packet or nb_section_frame according if is_packets_and_frames
301
+
302
+    StringValidation string_validation;
303
+    char *string_validation_replacement;
304
+    unsigned int string_validation_utf8_flags;
301 305
 };
302 306
 
303 307
 static const char *writer_get_name(void *p)
... ...
@@ -308,6 +319,19 @@ static const char *writer_get_name(void *p)
308 308
 
309 309
 #define OFFSET(x) offsetof(WriterContext, x)
310 310
 
311
+static const AVOption writer_options[] = {
312
+    { "string_validation", "set string validation mode",
313
+      OFFSET(string_validation), AV_OPT_TYPE_INT, {.i64=WRITER_STRING_VALIDATION_REPLACE}, 0, WRITER_STRING_VALIDATION_NB-1, .unit = "sv" },
314
+    { "sv", "set string validation mode",
315
+      OFFSET(string_validation), AV_OPT_TYPE_INT, {.i64=WRITER_STRING_VALIDATION_REPLACE}, 0, WRITER_STRING_VALIDATION_NB-1, .unit = "sv" },
316
+    { "ignore",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = WRITER_STRING_VALIDATION_IGNORE},  .unit = "sv" },
317
+    { "replace", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = WRITER_STRING_VALIDATION_REPLACE}, .unit = "sv" },
318
+    { "fail",    NULL, 0, AV_OPT_TYPE_CONST, {.i64 = WRITER_STRING_VALIDATION_FAIL},    .unit = "sv" },
319
+    { "string_validation_replacement", "set string validation replacement string", OFFSET(string_validation_replacement), AV_OPT_TYPE_STRING, {.str=""}},
320
+    { "svr", "set string validation replacement string", OFFSET(string_validation_replacement), AV_OPT_TYPE_STRING, {.str=""}},
321
+    { NULL },
322
+};
323
+
311 324
 static void *writer_child_next(void *obj, void *prev)
312 325
 {
313 326
     WriterContext *ctx = obj;
... ...
@@ -321,6 +345,7 @@ static const AVClass writer_class = {
321 321
     writer_get_name,
322 322
     NULL,
323 323
     LIBAVUTIL_VERSION_INT,
324
+    .option = writer_options,
324 325
     .child_next = writer_child_next,
325 326
 };
326 327
 
... ...
@@ -341,6 +366,15 @@ static void writer_close(WriterContext **wctx)
341 341
     av_freep(wctx);
342 342
 }
343 343
 
344
+static void bprint_bytes(AVBPrint *bp, const uint8_t *ubuf, size_t ubuf_size)
345
+{
346
+    int i;
347
+    av_bprintf(bp, "0X");
348
+    for (i = 0; i < ubuf_size; i++)
349
+        av_bprintf(bp, "%02X", ubuf[i]);
350
+}
351
+
352
+
344 353
 static int writer_open(WriterContext **wctx, const Writer *writer, const char *args,
345 354
                        const struct section *sections, int nb_sections)
346 355
 {
... ...
@@ -393,6 +427,26 @@ static int writer_open(WriterContext **wctx, const Writer *writer, const char *a
393 393
         av_dict_free(&opts);
394 394
     }
395 395
 
396
+    /* validate replace string */
397
+    {
398
+        const uint8_t *p = (*wctx)->string_validation_replacement;
399
+        const uint8_t *endp = p + strlen(p);
400
+        while (*p) {
401
+            const uint8_t *p0 = p;
402
+            int32_t code;
403
+            ret = av_utf8_decode(&code, &p, endp, (*wctx)->string_validation_utf8_flags);
404
+            if (ret < 0) {
405
+                AVBPrint bp;
406
+                av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
407
+                bprint_bytes(&bp, p0, p-p0),
408
+                    av_log(wctx, AV_LOG_ERROR,
409
+                           "Invalid UTF8 sequence %s found in string validation replace '%s'\n",
410
+                           bp.str, (*wctx)->string_validation_replacement);
411
+                return ret;
412
+            }
413
+        }
414
+    }
415
+
396 416
     for (i = 0; i < SECTION_MAX_NB_LEVELS; i++)
397 417
         av_bprint_init(&(*wctx)->section_pbuf[i], 1, AV_BPRINT_SIZE_UNLIMITED);
398 418
 
... ...
@@ -460,17 +514,94 @@ static inline void writer_print_integer(WriterContext *wctx,
460 460
     }
461 461
 }
462 462
 
463
+static inline int validate_string(WriterContext *wctx, char **dstp, const char *src)
464
+{
465
+    const uint8_t *p, *endp;
466
+    AVBPrint dstbuf;
467
+    int invalid_chars_nb = 0, ret = 0;
468
+
469
+    av_bprint_init(&dstbuf, 0, AV_BPRINT_SIZE_UNLIMITED);
470
+
471
+    endp = src + strlen(src);
472
+    for (p = (uint8_t *)src; *p;) {
473
+        uint32_t code;
474
+        int invalid = 0;
475
+        const uint8_t *p0 = p;
476
+
477
+        if (av_utf8_decode(&code, &p, endp, wctx->string_validation_utf8_flags) < 0) {
478
+            AVBPrint bp;
479
+            av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
480
+            bprint_bytes(&bp, p0, p-p0);
481
+            av_log(wctx, AV_LOG_DEBUG,
482
+                   "Invalid UTF-8 sequence %s found in string '%s'\n", bp.str, src);
483
+            invalid = 1;
484
+        }
485
+
486
+        if (invalid) {
487
+            invalid_chars_nb++;
488
+
489
+            switch (wctx->string_validation) {
490
+            case WRITER_STRING_VALIDATION_FAIL:
491
+                av_log(wctx, AV_LOG_ERROR,
492
+                       "Invalid UTF-8 sequence found in string '%s'\n", src);
493
+                ret = AVERROR_INVALIDDATA;
494
+                goto end;
495
+                break;
496
+
497
+            case WRITER_STRING_VALIDATION_REPLACE:
498
+                av_bprintf(&dstbuf, "%s", wctx->string_validation_replacement);
499
+                break;
500
+            }
501
+        }
502
+
503
+        if (!invalid || wctx->string_validation == WRITER_STRING_VALIDATION_IGNORE)
504
+            av_bprint_append_data(&dstbuf, p0, p-p0);
505
+    }
506
+
507
+    if (invalid_chars_nb && wctx->string_validation == WRITER_STRING_VALIDATION_REPLACE) {
508
+        av_log(wctx, AV_LOG_WARNING,
509
+               "%d invalid UTF-8 sequence(s) found in string '%s', replaced with '%s'\n",
510
+               invalid_chars_nb, src, wctx->string_validation_replacement);
511
+    }
512
+
513
+end:
514
+    av_bprint_finalize(&dstbuf, dstp);
515
+    return ret;
516
+}
517
+
518
/* Bit flags for the last argument of writer_print_string(); may be OR-ed. */
#define PRINT_STRING_OPT      (1 << 0) /* optional field: printed only by writers with WRITER_FLAG_DISPLAY_OPTIONAL_FIELDS */
#define PRINT_STRING_VALIDATE (1 << 1) /* run key and value through validate_string() before printing */
520
+
463 521
 static inline int writer_print_string(WriterContext *wctx,
464
-                                      const char *key, const char *val, int opt)
522
+                                      const char *key, const char *val, int flags)
465 523
 {
466 524
     const struct section *section = wctx->section[wctx->level];
467 525
     int ret = 0;
468 526
 
469
-    if (opt && !(wctx->writer->flags & WRITER_FLAG_DISPLAY_OPTIONAL_FIELDS))
527
+    if ((flags & PRINT_STRING_OPT)
528
+        && !(wctx->writer->flags & WRITER_FLAG_DISPLAY_OPTIONAL_FIELDS))
470 529
         return 0;
471 530
 
472 531
     if (section->show_all_entries || av_dict_get(section->entries_to_show, key, NULL, 0)) {
473
-        wctx->writer->print_string(wctx, key, val);
532
+        if (flags & PRINT_STRING_VALIDATE) {
533
+            char *key1 = NULL, *val1 = NULL;
534
+            ret = validate_string(wctx, &key1, key);
535
+            if (ret < 0) goto end;
536
+            ret = validate_string(wctx, &val1, val);
537
+            if (ret < 0) goto end;
538
+            wctx->writer->print_string(wctx, key1, val1);
539
+        end:
540
+            if (ret < 0) {
541
+                av_log(wctx, AV_LOG_ERROR,
542
+                       "Invalid key=value string combination %s=%s in section %s\n",
543
+                       key, val, section->unique_name);
544
+            }
545
+            av_free(key1);
546
+            av_free(val1);
547
+        } else {
548
+            wctx->writer->print_string(wctx, key, val);
549
+        }
550
+
474 551
         wctx->nb_item[wctx->level]++;
475 552
     }
476 553
 
... ...
@@ -492,7 +623,7 @@ static void writer_print_time(WriterContext *wctx, const char *key,
492 492
     char buf[128];
493 493
 
494 494
     if ((!is_duration && ts == AV_NOPTS_VALUE) || (is_duration && ts == 0)) {
495
-        writer_print_string(wctx, key, "N/A", 1);
495
+        writer_print_string(wctx, key, "N/A", PRINT_STRING_OPT);
496 496
     } else {
497 497
         double d = ts * av_q2d(*time_base);
498 498
         struct unit_value uv;
... ...
@@ -506,7 +637,7 @@ static void writer_print_time(WriterContext *wctx, const char *key,
506 506
 static void writer_print_ts(WriterContext *wctx, const char *key, int64_t ts, int is_duration)
507 507
 {
508 508
     if ((!is_duration && ts == AV_NOPTS_VALUE) || (is_duration && ts == 0)) {
509
-        writer_print_string(wctx, key, "N/A", 1);
509
+        writer_print_string(wctx, key, "N/A", PRINT_STRING_OPT);
510 510
     } else {
511 511
         writer_print_integer(wctx, key, ts);
512 512
     }
... ...
@@ -1476,7 +1607,8 @@ static void writer_register_all(void)
1476 1476
 #define print_int(k, v)         writer_print_integer(w, k, v)
1477 1477
 #define print_q(k, v, s)        writer_print_rational(w, k, v, s)
1478 1478
 #define print_str(k, v)         writer_print_string(w, k, v, 0)
1479
-#define print_str_opt(k, v)     writer_print_string(w, k, v, 1)
1479
+#define print_str_opt(k, v)     writer_print_string(w, k, v, PRINT_STRING_OPT)
1480
+#define print_str_validate(k, v) writer_print_string(w, k, v, PRINT_STRING_VALIDATE)
1480 1481
 #define print_time(k, v, tb)    writer_print_time(w, k, v, tb, 0)
1481 1482
 #define print_ts(k, v)          writer_print_ts(w, k, v, 0)
1482 1483
 #define print_duration_time(k, v, tb) writer_print_time(w, k, v, tb, 1)
... ...
@@ -1491,21 +1623,20 @@ static void writer_register_all(void)
1491 1491
 #define print_section_header(s) writer_print_section_header(w, s)
1492 1492
 #define print_section_footer(s) writer_print_section_footer(w, s)
1493 1493
 
1494
-static inline int show_tags(WriterContext *wctx, AVDictionary *tags, int section_id)
1494
+static inline int show_tags(WriterContext *w, AVDictionary *tags, int section_id)
1495 1495
 {
1496 1496
     AVDictionaryEntry *tag = NULL;
1497 1497
     int ret = 0;
1498 1498
 
1499 1499
     if (!tags)
1500 1500
         return 0;
1501
-    writer_print_section_header(wctx, section_id);
1501
+    writer_print_section_header(w, section_id);
1502 1502
 
1503 1503
     while ((tag = av_dict_get(tags, "", tag, AV_DICT_IGNORE_SUFFIX))) {
1504
-        ret = writer_print_string(wctx, tag->key, tag->value, 0);
1505
-        if (ret < 0)
1504
+        if ((ret = print_str_validate(tag->key, tag->value)) < 0)
1506 1505
             break;
1507 1506
     }
1508
-    writer_print_section_footer(wctx);
1507
+    writer_print_section_footer(w);
1509 1508
 
1510 1509
     return ret;
1511 1510
 }
... ...
@@ -2054,7 +2185,7 @@ static int show_format(WriterContext *w, AVFormatContext *fmt_ctx)
2054 2054
     int ret = 0;
2055 2055
 
2056 2056
     writer_print_section_header(w, SECTION_ID_FORMAT);
2057
-    print_str("filename",         fmt_ctx->filename);
2057
+    print_str_validate("filename", fmt_ctx->filename);
2058 2058
     print_int("nb_streams",       fmt_ctx->nb_streams);
2059 2059
     print_int("nb_programs",      fmt_ctx->nb_programs);
2060 2060
     print_str("format_name",      fmt_ctx->iformat->name);
... ...
@@ -2755,6 +2886,9 @@ int main(int argc, char **argv)
2755 2755
 
2756 2756
     if ((ret = writer_open(&wctx, w, w_args,
2757 2757
                            sections, FF_ARRAY_ELEMS(sections))) >= 0) {
2758
+        if (w == &xml_writer)
2759
+            wctx->string_validation_utf8_flags |= AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES;
2760
+
2758 2761
         writer_print_section_header(wctx, SECTION_ID_ROOT);
2759 2762
 
2760 2763
         if (do_show_program_version)