Browse code

pdfdecode: integrate lzw decompression

Kevin Lin authored on 2016/04/14 07:46:50
Showing 1 changed files
... ...
@@ -63,6 +63,7 @@
63 63
 #include "str.h"
64 64
 #include "bytecode.h"
65 65
 #include "bytecode_api.h"
66
+#include "lzw/lzwdec.h"
66 67
 
67 68
 struct pdf_token {
68 69
     uint32_t length;
... ...
@@ -77,7 +78,7 @@ static  int filter_rldecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct
77 77
 static  int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token);
78 78
 static  int filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
79 79
 static  int filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token, int mode);
80
-
80
+static  int filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token);
81 81
 
82 82
 off_t pdf_decodestream(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, const char *stream, uint32_t streamlen, int fout, int *rc)
83 83
 {
... ...
@@ -178,12 +179,15 @@ static int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj
178 178
             rc = filter_decrypt(pdf, obj, params, token, 0);
179 179
             break;
180 180
 
181
+        case OBJ_FILTER_LZW:
182
+            cli_dbgmsg("cli_pdf: decoding [%d] => LZWDECODE\n", obj->filterlist[i]);
183
+            rc = filter_lzwdecode(pdf, obj, params, token);
184
+            break;
185
+
181 186
         case OBJ_FILTER_JPX:
182 187
             if (!filter) filter = "JPXDECODE";
183 188
         case OBJ_FILTER_DCT:
184 189
             if (!filter) filter = "DCTDECODE";
185
-        case OBJ_FILTER_LZW:
186
-            if (!filter) filter = "LZWDECODE";
187 190
         case OBJ_FILTER_FAX:
188 191
             if (!filter) filter = "FAXDECODE";
189 192
         case OBJ_FILTER_JBIG2:
... ...
@@ -683,10 +687,12 @@ static int filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, struct pd
683 683
             if (node->type == PDF_DICT_STRING) {
684 684
                 if (!strncmp(node->key, "/Type", 6)) { /* optional field - Type */
685 685
                     /* MUST be "CryptFilterDecodeParms" */
686
-                    cli_dbgmsg("cli_pdf: Type: %s\n", (char *)(node->value));
686
+                    if (node->value)
687
+                        cli_dbgmsg("cli_pdf: Type: %s\n", (char *)(node->value));
687 688
                 } else if (!strncmp(node->key, "/Name", 6)) { /* optional field - Name */
688 689
                     /* overrides document and default encryption method */
689
-                    cli_dbgmsg("cli_pdf: Name: %s\n", (char *)(node->value));
690
+                    if (node->value)
691
+                        cli_dbgmsg("cli_pdf: Name: %s\n", (char *)(node->value));
690 692
                     enc = parse_enc_method(pdf->CF, pdf->CF_n, (char *)(node->value), enc);
691 693
                 }
692 694
             }
... ...
@@ -709,3 +715,171 @@ static int filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, struct pd
709 709
     token->length = (uint32_t)length; /* this may truncate unfortunately, TODO: use 64-bit values internally? */
710 710
     return CL_SUCCESS;
711 711
 }
712
+
713
+static int filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
714
+{
715
+    uint8_t *decoded, *temp;
716
+    uint32_t declen = 0, capacity = 0;
717
+
718
+    uint8_t *content = (uint8_t *)token->content;
719
+    uint32_t length = token->length;
720
+    lzw_stream stream;
721
+    int echg = 1, lzwstat, skip = 0, rc = CL_SUCCESS;
722
+
723
+    if (params) {
724
+        struct pdf_dict_node *node = params->nodes;
725
+
726
+        while (node) {
727
+            if (node->type == PDF_DICT_STRING) {
728
+                if (!strncmp(node->key, "/EarlyChange", 13)) { /* optional field - lzw flag */
729
+                    char *end, *value = (char *)node->value;
730
+                    long set;
731
+
732
+                    if (value) {
733
+                        cli_dbgmsg("cli_pdf: EarlyChange: %s\n", value);
734
+                        set = strtol(value, &end, 10);
735
+                        if (end != value)
736
+                            echg = (int)set;
737
+                    }
738
+                }
739
+            }
740
+            node = node->next;
741
+        }
742
+    }
743
+
744
+    if (*content == '\r') {
745
+        content++;
746
+        length--;
747
+        pdfobj_flag(pdf, obj, BAD_STREAMSTART);
748
+        /* PDF spec says stream is followed by \r\n or \n, but not \r alone.
749
+         * Sample 0015315109, it has \r followed by zlib header.
750
+         * Flag pdf as suspicious, and attempt to extract by skipping the \r.
751
+         */
752
+        if (!length)
753
+            return CL_SUCCESS;
754
+    }
755
+
756
+    if (!(decoded = (uint8_t *)cli_calloc(BUFSIZ, sizeof(uint8_t)))) {
757
+        cli_errmsg("cli_pdf: cannot allocate memory for decoded output\n");
758
+        return CL_EMEM;
759
+    }
760
+    capacity = BUFSIZ;
761
+
762
+    memset(&stream, 0, sizeof(stream));
763
+    stream.next_in = content;
764
+    stream.avail_in = length;
765
+    stream.next_out = decoded;
766
+    stream.avail_out = BUFSIZ;
767
+
768
+    lzwstat = lzwInit(&stream, echg ? LZW_FLAG_EARLYCHG : LZW_NOFLAGS);
769
+    if(lzwstat != Z_OK) {
770
+        cli_warnmsg("cli_pdf: lzwInit failed\n");
771
+        free(decoded);
772
+        return CL_EMEM;
773
+    }
774
+
775
+    /* initial inflate */
776
+    lzwstat = lzwInflate(&stream);
777
+    /* check if nothing written whatsoever */
778
+    if ((lzwstat != Z_OK) && (stream.avail_out == BUFSIZ)) {
779
+        /* skip till EOL, and try inflating from there, sometimes
780
+         * PDFs contain extra whitespace */
781
+        uint8_t *q = decode_nextlinestart(content, length);
782
+        if (q) {
783
+            (void)lzwInflateEnd(&stream);
784
+            length -= q - content;
785
+            content = q;
786
+
787
+            stream.next_in = (Bytef *)content;
788
+            stream.avail_in = length;
789
+            stream.next_out = (Bytef *)decoded;
790
+            stream.avail_out = capacity;
791
+
792
+            lzwstat = lzwInit(&stream, echg ? LZW_FLAG_EARLYCHG : LZW_NOFLAGS);
793
+            if(lzwstat != Z_OK) {
794
+                cli_warnmsg("cli_pdf: lzwInit failed\n");
795
+                free(decoded);
796
+                return CL_EMEM;
797
+            }
798
+
799
+            pdfobj_flag(pdf, obj, BAD_FLATESTART);
800
+        }
801
+
802
+        lzwstat = lzwInflate(&stream);
803
+    }
804
+
805
+    while (lzwstat == Z_OK && stream.avail_in) {
806
+        /* extend output capacity if needed,*/
807
+        if(stream.avail_out == 0) {
808
+            if ((rc = cli_checklimits("pdf", pdf->ctx, capacity+BUFSIZ, 0, 0)) != CL_SUCCESS)
809
+                break;
810
+
811
+            if (!(temp = cli_realloc(decoded, capacity + BUFSIZ))) {
812
+                cli_errmsg("cli_pdf: cannot reallocate memory for decoded output\n");
813
+                rc = CL_EMEM;
814
+                break;
815
+            }
816
+            decoded = temp;
817
+            stream.next_out = decoded + capacity;
818
+            stream.avail_out = BUFSIZ;
819
+            declen += BUFSIZ;
820
+            capacity += BUFSIZ;
821
+        }
822
+
823
+        /* continue inflation */
824
+        lzwstat = lzwInflate(&stream);
825
+    }
826
+
827
+    /* add stream end fragment to decoded length */
828
+    declen += (BUFSIZ - stream.avail_out);
829
+
830
+    /* error handling */
831
+    switch(lzwstat) {
832
+    case LZW_OK:
833
+        cli_dbgmsg("cli_pdf: LZW_OK on stream inflation completion\n");
834
+        /* intentional fall-through */
835
+    case LZW_STREAM_END:
836
+        cli_dbgmsg("cli_pdf: inflated %lu bytes from %lu total bytes (%lu bytes remaining)\n",
837
+                   (unsigned long)declen, (unsigned long)(token->length), (unsigned long)(stream.avail_in));
838
+        break;
839
+
840
+    /* potentially fatal - *mostly* ignored as per older version */
841
+    case LZW_STREAM_ERROR:
842
+    case LZW_DATA_ERROR:
843
+    case LZW_MEM_ERROR:
844
+    case LZW_BUF_ERROR:
845
+    case LZW_DICT_ERROR:
846
+    default:
847
+        if(stream.msg)
848
+            cli_dbgmsg("cli_pdf: after writing %lu bytes, got error \"%s\" inflating PDF stream in %u %u obj\n",
849
+                       (unsigned long)declen, stream.msg, obj->id>>8, obj->id&0xff);
850
+        else
851
+            cli_dbgmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n",
852
+                       (unsigned long)declen, lzwstat, obj->id>>8, obj->id&0xff);
853
+
854
+        if (declen == 0) {
855
+            pdfobj_flag(pdf, obj, BAD_FLATESTART);
856
+            cli_dbgmsg("cli_pdf: no bytes were inflated.\n");
857
+
858
+            rc = CL_EFORMAT;
859
+        } else {
860
+            pdfobj_flag(pdf, obj, BAD_FLATE);
861
+        }
862
+        break;
863
+    }
864
+
865
+    (void)lzwInflateEnd(&stream);
866
+
867
+    if (rc == CL_SUCCESS) {
868
+        free(token->content);
869
+
870
+        token->content = decoded;
871
+        token->length = declen;
872
+    } else {
873
+        cli_errmsg("cli_pdf: error occurred parsing byte %lu of %lu\n",
874
+                   (unsigned long)(length-stream.avail_in), (unsigned long)(token->length));
875
+        free(decoded);
876
+    }
877
+
878
+    return rc;
879
+}