Browse code

forced pdf json strings to be utf-8 or base64 encoded

Kevin Lin authored on 2015/03/03 09:06:23
Showing 3 changed files
... ...
@@ -3511,22 +3511,86 @@ static void pdf_export_json(struct pdf_struct *pdf)
3511 3511
         goto cleanup;
3512 3512
     }
3513 3513
 
3514
-    if (pdf->stats.author)
3515
-        cli_jsonstr(pdfobj, "Author", pdf->stats.author);
3516
-    if (pdf->stats.creator)
3517
-        cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
3518
-    if (pdf->stats.producer)
3519
-        cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
3520
-    if (pdf->stats.modificationdate)
3521
-        cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
3522
-    if (pdf->stats.creationdate)
3523
-        cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
3524
-    if (pdf->stats.title)
3525
-        cli_jsonstr(pdfobj, "Title", pdf->stats.title);
3526
-    if (pdf->stats.subject)
3527
-        cli_jsonstr(pdfobj, "Subject", pdf->stats.subject);
3528
-    if (pdf->stats.keywords)
3529
-        cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords);
3514
+    if (pdf->stats.author) {
3515
+        if (cli_isutf8(pdf->stats.author, strlen(pdf->stats.author)))
3516
+            cli_jsonstr(pdfobj, "Author", pdf->stats.author);
3517
+        else {
3518
+            char *b64 = (char *)cl_base64_encode(pdf->stats.author, strlen(pdf->stats.author));
3519
+            cli_jsonstr(pdfobj, "Author", b64);
3520
+            cli_jsonbool(pdfobj, "Author_base64", 1);
3521
+            free(b64);
3522
+        }
3523
+    }
3524
+    if (pdf->stats.creator) {
3525
+        if (cli_isutf8(pdf->stats.creator, strlen(pdf->stats.creator)))
3526
+            cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
3527
+        else {
3528
+            char *b64 = (char *)cl_base64_encode(pdf->stats.creator, strlen(pdf->stats.creator));
3529
+            cli_jsonstr(pdfobj, "Creator", b64);
3530
+            cli_jsonbool(pdfobj, "Creator_base64", 1);
3531
+            free(b64);
3532
+        }
3533
+    }
3534
+    if (pdf->stats.producer) {
3535
+        if (cli_isutf8(pdf->stats.producer, strlen(pdf->stats.producer)))
3536
+            cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
3537
+        else {
3538
+            char *b64 = (char *)cl_base64_encode(pdf->stats.producer, strlen(pdf->stats.producer));
3539
+            cli_jsonstr(pdfobj, "Producer", b64);
3540
+            cli_jsonbool(pdfobj, "Producer_base64", 1);
3541
+            free(b64);
3542
+        }
3543
+    }
3544
+    if (pdf->stats.modificationdate) {
3545
+        if (cli_isutf8(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate)))
3546
+            cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
3547
+        else {
3548
+            char *b64 = (char *)cl_base64_encode(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate));
3549
+            cli_jsonstr(pdfobj, "ModificationDate", b64);
3550
+            cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
3551
+            free(b64);
3552
+        }
3553
+    }
3554
+    if (pdf->stats.creationdate) {
3555
+        if (cli_isutf8(pdf->stats.creationdate, strlen(pdf->stats.creationdate)))
3556
+            cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
3557
+        else {
3558
+            char *b64 = (char *)cl_base64_encode(pdf->stats.creationdate, strlen(pdf->stats.creationdate));
3559
+            cli_jsonstr(pdfobj, "CreationDate", b64);
3560
+            cli_jsonbool(pdfobj, "CreationDate_base64", 1);
3561
+            free(b64);
3562
+        }
3563
+    }
3564
+    if (pdf->stats.title) {
3565
+        if (cli_isutf8(pdf->stats.title, strlen(pdf->stats.title)))
3566
+            cli_jsonstr(pdfobj, "Title", pdf->stats.title);
3567
+        else {
3568
+            char *b64 = (char *)cl_base64_encode(pdf->stats.title, strlen(pdf->stats.title));
3569
+            cli_jsonstr(pdfobj, "Title", b64);
3570
+            cli_jsonbool(pdfobj, "Title_base64", 1);
3571
+            free(b64);
3572
+        }
3573
+    }
3574
+    if (pdf->stats.subject) {
3575
+        if (cli_isutf8(pdf->stats.subject, strlen(pdf->stats.subject)))
3576
+            cli_jsonstr(pdfobj, "Subject", pdf->stats.subject);
3577
+        else {
3578
+            char *b64 = (char *)cl_base64_encode(pdf->stats.subject, strlen(pdf->stats.subject));
3579
+            cli_jsonstr(pdfobj, "Subject", b64);
3580
+            cli_jsonbool(pdfobj, "Subject_base64", 1);
3581
+            free(b64);
3582
+        }
3583
+    }
3584
+    if (pdf->stats.keywords) {
3585
+        if (cli_isutf8(pdf->stats.keywords, strlen(pdf->stats.keywords)))
3586
+            cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords);
3587
+        else {
3588
+            char *b64 = (char *)cl_base64_encode(pdf->stats.keywords, strlen(pdf->stats.keywords));
3589
+            cli_jsonstr(pdfobj, "Keywords", b64);
3590
+            cli_jsonbool(pdfobj, "Keywords_base64", 1);
3591
+            free(b64);
3592
+        }
3593
+    }
3530 3594
     if (pdf->stats.ninvalidobjs)
3531 3595
         cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
3532 3596
     if (pdf->stats.njs)
... ...
@@ -666,3 +666,49 @@ char *cli_utf16_to_utf8(const char *utf16, size_t length, utf16_type type)
666 666
     s2[j] = '\0';
667 667
     return s2;
668 668
 }
669
+
670
/**
 * Check whether a byte buffer is well-formed UTF-8 as defined by RFC 3629.
 *
 * Beyond verifying that each lead byte is followed by the right number of
 * 10xxxxxx continuation bytes, the decoded code point is checked so that
 * the following are rejected:
 *   - overlong encodings (a code point stored in more bytes than needed),
 *   - UTF-16 surrogate code points U+D800..U+DFFF,
 *   - code points above U+10FFFF,
 *   - the obsolete 5- and 6-byte forms (lead bytes 0xF8..0xFD), and
 *     the never-valid bytes 0xFE / 0xFF.
 *
 * @param buf  Bytes to examine; does not need to be NUL-terminated.
 * @param len  Number of bytes in buf to examine.
 * @return     1 if the first len bytes are valid UTF-8 (an empty buffer
 *             counts as valid), 0 otherwise.
 */
int cli_isutf8(const unsigned char *buf, unsigned int len)
{
    unsigned int i = 0;

    while(i < len) {
        const unsigned char lead = buf[i++];
        unsigned int following;
        unsigned long cp;     /* code point being decoded */
        unsigned long min_cp; /* smallest code point legal for this length */

        if(lead < 0x80) /* 0xxxxxxx is plain ASCII */
            continue;

        if((lead & 0xE0) == 0xC0) {        /* 110xxxxx */
            following = 1;
            cp        = lead & 0x1F;
            min_cp    = 0x80;
        } else if((lead & 0xF0) == 0xE0) { /* 1110xxxx */
            following = 2;
            cp        = lead & 0x0F;
            min_cp    = 0x800;
        } else if((lead & 0xF8) == 0xF0) { /* 11110xxx */
            following = 3;
            cp        = lead & 0x07;
            min_cp    = 0x10000;
        } else {
            /* 10xxxxxx is never a first byte; 0xF8..0xFF are not UTF-8 */
            return 0;
        }

        /* i <= len here, so the subtraction cannot wrap */
        if(following > len - i)
            return 0; /* sequence is truncated */

        while(following-- > 0) {
            const unsigned char cont = buf[i++];

            if((cont & 0xC0) != 0x80) /* must be 10xxxxxx */
                return 0;

            cp = (cp << 6) | (unsigned long)(cont & 0x3F);
        }

        if(cp < min_cp) /* overlong encoding */
            return 0;
        if(cp >= 0xD800 && cp <= 0xDFFF) /* UTF-16 surrogate */
            return 0;
        if(cp > 0x10FFFF) /* beyond the Unicode range */
            return 0;
    }

    return 1;
}
... ...
@@ -58,5 +58,7 @@ typedef enum {
58 58
 } utf16_type;
59 59
 char *cli_utf16_to_utf8(const char *utf16, size_t length, utf16_type type);
60 60
 
61
+int cli_isutf8(const unsigned char *buf, unsigned int len); /* returns 1 if the first len bytes of buf pass the UTF-8 check, 0 otherwise */
62
+
61 63
 size_t cli_strlcat(char *dst, const char *src, size_t sz); /* libclamav/strlcat.c */
62 64
 #endif