Browse code

More JavaScript dumping fixes.

Török Edvin authored on 2012/01/19 03:58:38
Showing 1 changed file
... ...
@@ -827,6 +827,8 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf)
827 827
 static const char *pdf_getdict(const char *q0, int* len, const char *key);
828 828
 static char *pdf_readval(const char *q, int len, const char *key);
829 829
 static enum enc_method parse_enc_method(const char *dict, unsigned len, const char *key, enum enc_method def);
830
+static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, int noescape);
831
+
830 832
 static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
831 833
 {
832 834
     char fullname[NAME_MAX + 1];
... ...
@@ -1002,91 +1004,53 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
1002 1002
 	    break;
1003 1003
 
1004 1004
       do {
1005
+        char *js = NULL;
1006
+        off_t js_len = 0;
1005 1007
 
1006 1008
 	q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
1007 1009
 	if (!q2)
1008 1010
 	    break;
1009 1011
 	bytesleft -= q2 - q;
1010
-	do {
1011
-	q2++;
1012
-	bytesleft--;
1013
-	q = pdf_nextobject(q2, bytesleft);
1014
-	if (!q)
1015
-	    break;
1016
-	bytesleft -= q - q2;
1017
-	q2 = q;
1018
-	} while (*q == '/');
1019
-	if (!q)
1020
-	    break;
1021
-
1022
-	if (*q == '(') {
1023
-	    const char *out, *q2, *end;
1024
-	    long n = bytesleft;
1025
-	    q++;
1026
-	    n--;
1027
-	    out = q;
1028
-	    end = q + n;
1029
-
1030
-	    do {
1031
-		q2 = memchr(q, ')', n);
1032
-		if (q2) {
1033
-		    q2++;
1034
-		    n -= q2 - q;
1035
-		    q = q2;
1036
-		}
1037
-	    } while (n > 0 && q2 && q2[-2] == '\\');
1038
-	    if (q2)
1039
-		end = q2-1;
1040
-	    n = end - out;
1041
-	    bytesleft -= q - out;
1042
-
1043
-	    if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1044
-		cli_dbgmsg("cli_pdf: encrypted string\n");
1045
-		decrypted = decrypt_any(pdf, obj->id, out, &n,
1012
+        q = q2 + 11;
1013
+
1014
+        js = pdf_readstring(q, bytesleft,  "/JS", NULL, &q2, !(pdf->flags & (1<<DECRYPTABLE_PDF)));
1015
+        bytesleft -= q2 - q;
1016
+        q = q2;
1017
+
1018
+        if (js) {
1019
+            const char *out = js;
1020
+            js_len = strlen(js);
1021
+            if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1022
+                cli_dbgmsg("cli_pdf: encrypted string\n");
1023
+		decrypted = decrypt_any(pdf, obj->id, js, &js_len,
1046 1024
 					pdf->enc_method_string);
1047 1025
 		if (decrypted) {
1048 1026
 		    noisy_msg(pdf, "decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff);
1049 1027
 		    out = decrypted;
1050 1028
 		}
1051 1029
 	    }
1052
-	    if (filter_writen(pdf, obj, fout, out, n, &sum) != n) {
1030
+
1031
+	    if (filter_writen(pdf, obj, fout, out, js_len, &sum) != js_len) {
1053 1032
 		rc = CL_EWRITE;
1033
+                free(js);
1054 1034
 		break;
1055 1035
 	    }
1036
+            free(js);
1056 1037
 	    cli_dbgmsg("bytesleft: %d\n", bytesleft);
1057
-	} else if (*q == '<') {
1058
-	    char *decoded;
1059
-	    const char *out;
1060
-	    long n;
1061
-	    q2 = memchr(q+1, '>', bytesleft);
1062
-	    if (!q2) q2 = q + bytesleft;
1063
-	    n = q2 - q;
1064
-	    out = q;
1065
-	    q += n;
1066
-	    bytesleft -= n;
1067
-	    n--;
1068
-	    if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1069
-		out++;
1070
-		n--;
1071
-		decrypted = decrypt_any(pdf, obj->id, out, &n, pdf->enc_method_string);
1072
-		if (decrypted) {
1073
-		    noisy_msg(pdf, "decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff);
1074
-		    out = decrypted;
1075
-		}
1076
-	    }
1077
-	    decoded = cli_malloc(n);
1078
-	    if (!decoded) {
1079
-		rc = CL_EMEM;
1080
-		break;
1081
-	    }
1082
-	    cli_hex2str_to(out, decoded, n-1);
1083
-	    decoded[n-1] = '\0';
1084
-	    cli_dbgmsg("cli_pdf: found hexadecimal encoded javascript in %u %u obj\n",
1085
-		       obj->id>>8, obj->id&0xff);
1086
-	    pdfobj_flag(pdf, obj, HEX_JAVASCRIPT);
1087
-	    filter_writen(pdf, obj, fout, decoded, n-1, &sum);
1088
-	    free(decoded);
1089
-	}
1038
+
1039
+            q2 = pdf_nextobject(q, bytesleft);
1040
+            if (!q2) q2 = q + bytesleft - 1;
1041
+            /* non-conforming PDFs that don't escape ) properly */
1042
+            const char *q3 = memchr(q, ')', bytesleft);
1043
+            if (q3 && q3 < q2) q2 = q3;
1044
+            while (q2 > q && q2[-1] == ' ') q2--;
1045
+            if (q2 > q) {
1046
+                q--;
1047
+                filter_writen(pdf, obj, fout, q, q2 - q, &sum);
1048
+                q++;
1049
+            }
1050
+        }
1051
+
1090 1052
       } while (bytesleft > 0);
1091 1053
     } else {
1092 1054
 	off_t bytesleft = obj_size(pdf, obj, 0);
... ...
@@ -1236,7 +1200,6 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj,
1236 1236
     }
1237 1237
 }
1238 1238
 
1239
-static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen);
1240 1239
 static int pdf_readint(const char *q0, int len, const char *key);
1241 1240
 
1242 1241
 static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
... ...
@@ -1280,7 +1243,7 @@ static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length
1280 1280
 	char *newID;
1281 1281
 	pdf->flags |= 1 << ENCRYPTED_PDF;
1282 1282
 	pdf_parse_encrypt(pdf, enc, s + length - enc);
1283
-	newID = pdf_readstring(s, length, "/ID", &pdf->fileIDlen);
1283
+	newID = pdf_readstring(s, length, "/ID", &pdf->fileIDlen, NULL, 0);
1284 1284
 	if (newID) {
1285 1285
 	    free(pdf->fileID);
1286 1286
 	    pdf->fileID = newID;
... ...
@@ -1477,12 +1440,14 @@ static const char *pdf_getdict(const char *q0, int* len, const char *key)
1477 1477
     return q;
1478 1478
 }
1479 1479
 
1480
-static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen)
1480
+static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, int noescape)
1481 1481
 {
1482 1482
     char *s, *s0;
1483 1483
     const char *start, *q, *end;
1484 1484
     if (slen)
1485 1485
 	*slen = 0;
1486
+    if (qend)
1487
+        *qend = q0;
1486 1488
     q = pdf_getdict(q0, &len, key);
1487 1489
     if (!q)
1488 1490
 	return NULL;
... ...
@@ -1505,12 +1470,18 @@ static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *
1505 1505
 		    break;
1506 1506
 	    }
1507 1507
 	}
1508
+        if (qend)
1509
+            *qend = q;
1508 1510
 	q--;
1509 1511
 	len  = q - start;
1510 1512
 	s0 = s = cli_malloc(len + 1);
1511 1513
 	if (!s)
1512 1514
 	    return NULL;
1513 1515
 	end = start + len;
1516
+        if (noescape) {
1517
+            memcpy(s0, start, len);
1518
+            s = s0 + len;
1519
+        } else {
1514 1520
 	for (q = start;q < end;q++) {
1515 1521
 	    if (*q != '\\') {
1516 1522
 		*s++ = *q;
... ...
@@ -1564,11 +1535,13 @@ static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *
1564 1564
 			break;
1565 1565
 		    default:
1566 1566
 			/* ignore */
1567
-			q--;
1567
+                        *s++ = '\\';
1568
+                        q--;
1568 1569
 			break;
1569 1570
 		}
1570 1571
 	    }
1571 1572
 	}
1573
+        }
1572 1574
 	*s++ = '\0';
1573 1575
 	if (slen)
1574 1576
 	    *slen = s - s0 - 1;
... ...
@@ -1579,6 +1552,8 @@ static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *
1579 1579
 	q = memchr(q+1, '>', len);
1580 1580
 	if (!q)
1581 1581
 	    return NULL;
1582
+        if (qend)
1583
+            *qend = q;
1582 1584
 	s = cli_malloc((q - start)/2 + 1);
1583 1585
 	cli_hex2str_to(start, s, q - start);
1584 1586
 	s[(q-start)/2] = '\0';
... ...
@@ -1893,7 +1868,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1893 1893
 		length = 128;
1894 1894
 	    else {
1895 1895
 		n = 0;
1896
-		UE = pdf_readstring(q, len, "/UE", &n);
1896
+		UE = pdf_readstring(q, len, "/UE", &n, NULL, 0);
1897 1897
 		length = 256;
1898 1898
 	    }
1899 1899
 	}
... ...
@@ -1901,7 +1876,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1901 1901
 	    length = 40;
1902 1902
 
1903 1903
 	n = 0;
1904
-	O = pdf_readstring(q, len, "/O", &n);
1904
+	O = pdf_readstring(q, len, "/O", &n, NULL, 0);
1905 1905
 	if (!O || n < oulen) {
1906 1906
 	    cli_dbgmsg("cli_pdf: invalid O: %d\n", n);
1907 1907
 	    cli_dbgmsg("cli_pdf: invalid O: %d\n", n);
... ...
@@ -1921,7 +1896,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1921 1921
 	}
1922 1922
 
1923 1923
 	n = 0;
1924
-	U = pdf_readstring(q, len, "/U", &n);
1924
+	U = pdf_readstring(q, len, "/U", &n, NULL, 0);
1925 1925
 	if (!U || n < oulen) {
1926 1926
 	    cli_dbgmsg("cli_pdf: invalid U: %d\n", n);
1927 1927
 	    noisy_warnmsg("cli_pdf: invalid U: %d\n", n);