Browse code

idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall. Includes mmx2 asm for the various functions. Note that the actual idct still does not have an x86 SIMD implementation. For wmv3 files using regular idct, the decoder just falls back to simple_idct, since simple_idct_dc doesn't exist (yet).

Originally committed as revision 19204 to svn://svn.ffmpeg.org/ffmpeg/trunk

Jason Garrett-Glaser authored on 2009/06/16 18:00:55
Showing 5 changed files
... ...
@@ -486,6 +486,10 @@ typedef struct DSPContext {
486 486
     void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
487 487
     void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
488 488
     void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);
489
+    void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
490
+    void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
491
+    void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
492
+    void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
489 493
     void (*vc1_v_overlap)(uint8_t* src, int stride);
490 494
     void (*vc1_h_overlap)(uint8_t* src, int stride);
491 495
     void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
... ...
@@ -337,6 +337,10 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte
337 337
         v->s.dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
338 338
         v->s.dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
339 339
         v->s.dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
340
+        v->s.dsp.vc1_inv_trans_8x8_dc = ff_simple_idct_add;
341
+        v->s.dsp.vc1_inv_trans_8x4_dc = ff_simple_idct84_add;
342
+        v->s.dsp.vc1_inv_trans_4x8_dc = ff_simple_idct48_add;
343
+        v->s.dsp.vc1_inv_trans_4x4_dc = ff_simple_idct44_add;
340 344
     }
341 345
 
342 346
     v->fastuvmc =  get_bits1(gb); //common
... ...
@@ -2028,8 +2028,12 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
2028 2028
                 block[idx] += (block[idx] < 0) ? -mquant : mquant;
2029 2029
         }
2030 2030
         if(!skip_block){
2031
-            s->dsp.vc1_inv_trans_8x8(block);
2032
-            s->dsp.add_pixels_clamped(block, dst, linesize);
2031
+            if(i==1)
2032
+                s->dsp.vc1_inv_trans_8x8_dc(dst, linesize, block);
2033
+            else{
2034
+                s->dsp.vc1_inv_trans_8x8(block);
2035
+                s->dsp.add_pixels_clamped(block, dst, linesize);
2036
+            }
2033 2037
             if(apply_filter && cbp_top  & 0xC)
2034 2038
                 s->dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
2035 2039
             if(apply_filter && cbp_left & 0xA)
... ...
@@ -2053,7 +2057,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
2053 2053
                     block[idx + off] += (block[idx + off] < 0) ? -mquant : mquant;
2054 2054
             }
2055 2055
             if(!(subblkpat & (1 << (3 - j))) && !skip_block){
2056
-                s->dsp.vc1_inv_trans_4x4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off);
2056
+                if(i==1)
2057
+                    s->dsp.vc1_inv_trans_4x4_dc(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off);
2058
+                else
2059
+                    s->dsp.vc1_inv_trans_4x4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off);
2057 2060
                 if(apply_filter && (j&2 ? pat & (1<<(j-2)) : (cbp_top & (1 << (j + 2)))))
2058 2061
                     s->dsp.vc1_v_loop_filter4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, v->pq);
2059 2062
                 if(apply_filter && (j&1 ? pat & (1<<(j-1)) : (cbp_left & (1 << (j + 1)))))
... ...
@@ -2078,7 +2085,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
2078 2078
                     block[idx] += (block[idx] < 0) ? -mquant : mquant;
2079 2079
             }
2080 2080
             if(!(subblkpat & (1 << (1 - j))) && !skip_block){
2081
-                s->dsp.vc1_inv_trans_8x4(dst + j*4*linesize, linesize, block + off);
2081
+                if(i==1)
2082
+                    s->dsp.vc1_inv_trans_8x4_dc(dst + j*4*linesize, linesize, block + off);
2083
+                else
2084
+                    s->dsp.vc1_inv_trans_8x4(dst + j*4*linesize, linesize, block + off);
2082 2085
                 if(apply_filter && j ? pat & 0x3 : (cbp_top & 0xC))
2083 2086
                     s->dsp.vc1_v_loop_filter8(dst + j*4*linesize, linesize, v->pq);
2084 2087
                 if(apply_filter && cbp_left & (2 << j))
... ...
@@ -2103,7 +2113,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
2103 2103
                     block[idx] += (block[idx] < 0) ? -mquant : mquant;
2104 2104
             }
2105 2105
             if(!(subblkpat & (1 << (1 - j))) && !skip_block){
2106
-                s->dsp.vc1_inv_trans_4x8(dst + j*4, linesize, block + off);
2106
+                if(i==1)
2107
+                    s->dsp.vc1_inv_trans_4x8_dc(dst + j*4, linesize, block + off);
2108
+                else
2109
+                    s->dsp.vc1_inv_trans_4x8(dst + j*4, linesize, block + off);
2107 2110
                 if(apply_filter && cbp_top & (2 << j))
2108 2111
                     s->dsp.vc1_v_loop_filter4(dst + j*4, linesize, v->pq);
2109 2112
                 if(apply_filter && j ? pat & 0x5 : (cbp_left & 0xA))
... ...
@@ -178,6 +178,26 @@ static void vc1_h_loop_filter16_c(uint8_t *src, int stride, int pq)
178 178
 
179 179
 /** Do inverse transform on 8x8 block
180 180
 */
181
+static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
182
+{
183
+    int i;
184
+    int dc = block[0];
185
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
186
+    dc = (3 * dc +  1) >> 1;
187
+    dc = (3 * dc + 16) >> 5;
188
+    for(i = 0; i < 8; i++){
189
+        dest[0] = cm[dest[0]+dc];
190
+        dest[1] = cm[dest[1]+dc];
191
+        dest[2] = cm[dest[2]+dc];
192
+        dest[3] = cm[dest[3]+dc];
193
+        dest[4] = cm[dest[4]+dc];
194
+        dest[5] = cm[dest[5]+dc];
195
+        dest[6] = cm[dest[6]+dc];
196
+        dest[7] = cm[dest[7]+dc];
197
+        dest += linesize;
198
+    }
199
+}
200
+
181 201
 static void vc1_inv_trans_8x8_c(DCTELEM block[64])
182 202
 {
183 203
     int i;
... ...
@@ -249,6 +269,26 @@ static void vc1_inv_trans_8x8_c(DCTELEM block[64])
249 249
 
250 250
 /** Do inverse transform on 8x4 part of block
251 251
 */
252
+static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
253
+{
254
+    int i;
255
+    int dc = block[0];
256
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
257
+    dc = ( 3 * dc +  1) >> 1;
258
+    dc = (17 * dc + 64) >> 7;
259
+    for(i = 0; i < 4; i++){
260
+        dest[0] = cm[dest[0]+dc];
261
+        dest[1] = cm[dest[1]+dc];
262
+        dest[2] = cm[dest[2]+dc];
263
+        dest[3] = cm[dest[3]+dc];
264
+        dest[4] = cm[dest[4]+dc];
265
+        dest[5] = cm[dest[5]+dc];
266
+        dest[6] = cm[dest[6]+dc];
267
+        dest[7] = cm[dest[7]+dc];
268
+        dest += linesize;
269
+    }
270
+}
271
+
252 272
 static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block)
253 273
 {
254 274
     int i;
... ...
@@ -306,6 +346,22 @@ static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block)
306 306
 
307 307
 /** Do inverse transform on 4x8 parts of block
308 308
 */
309
+static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
310
+{
311
+    int i;
312
+    int dc = block[0];
313
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
314
+    dc = (17 * dc +  4) >> 3;
315
+    dc = (12 * dc + 64) >> 7;
316
+    for(i = 0; i < 8; i++){
317
+        dest[0] = cm[dest[0]+dc];
318
+        dest[1] = cm[dest[1]+dc];
319
+        dest[2] = cm[dest[2]+dc];
320
+        dest[3] = cm[dest[3]+dc];
321
+        dest += linesize;
322
+    }
323
+}
324
+
309 325
 static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block)
310 326
 {
311 327
     int i;
... ...
@@ -363,6 +419,22 @@ static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block)
363 363
 
364 364
 /** Do inverse transform on 4x4 part of block
365 365
 */
366
+static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
367
+{
368
+    int i;
369
+    int dc = block[0];
370
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
371
+    dc = (17 * dc +  4) >> 3;
372
+    dc = (17 * dc + 64) >> 7;
373
+    for(i = 0; i < 4; i++){
374
+        dest[0] = cm[dest[0]+dc];
375
+        dest[1] = cm[dest[1]+dc];
376
+        dest[2] = cm[dest[2]+dc];
377
+        dest[3] = cm[dest[3]+dc];
378
+        dest += linesize;
379
+    }
380
+}
381
+
366 382
 static void vc1_inv_trans_4x4_c(uint8_t *dest, int linesize, DCTELEM *block)
367 383
 {
368 384
     int i;
... ...
@@ -545,6 +617,10 @@ void ff_vc1dsp_init(DSPContext* dsp, AVCodecContext *avctx) {
545 545
     dsp->vc1_inv_trans_4x8 = vc1_inv_trans_4x8_c;
546 546
     dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_c;
547 547
     dsp->vc1_inv_trans_4x4 = vc1_inv_trans_4x4_c;
548
+    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_c;
549
+    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_c;
550
+    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_c;
551
+    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c;
548 552
     dsp->vc1_h_overlap = vc1_h_overlap_c;
549 553
     dsp->vc1_v_overlap = vc1_v_overlap_c;
550 554
     dsp->vc1_v_loop_filter4 = vc1_v_loop_filter4_c;
... ...
@@ -494,6 +494,204 @@ DECLARE_FUNCTION(3, 1)
494 494
 DECLARE_FUNCTION(3, 2)
495 495
 DECLARE_FUNCTION(3, 3)
496 496
 
497
+static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
498
+{
499
+    int dc = block[0];
500
+    dc = (17 * dc +  4) >> 3;
501
+    dc = (17 * dc + 64) >> 7;
502
+    __asm__ volatile(
503
+        "movd          %0, %%mm0 \n\t"
504
+        "pshufw $0, %%mm0, %%mm0 \n\t"
505
+        "pxor       %%mm1, %%mm1 \n\t"
506
+        "psubw      %%mm0, %%mm1 \n\t"
507
+        "packuswb   %%mm0, %%mm0 \n\t"
508
+        "packuswb   %%mm1, %%mm1 \n\t"
509
+        ::"r"(dc)
510
+    );
511
+    __asm__ volatile(
512
+        "movd          %0, %%mm2 \n\t"
513
+        "movd          %1, %%mm3 \n\t"
514
+        "movd          %2, %%mm4 \n\t"
515
+        "movd          %3, %%mm5 \n\t"
516
+        "paddusb    %%mm0, %%mm2 \n\t"
517
+        "paddusb    %%mm0, %%mm3 \n\t"
518
+        "paddusb    %%mm0, %%mm4 \n\t"
519
+        "paddusb    %%mm0, %%mm5 \n\t"
520
+        "psubusb    %%mm1, %%mm2 \n\t"
521
+        "psubusb    %%mm1, %%mm3 \n\t"
522
+        "psubusb    %%mm1, %%mm4 \n\t"
523
+        "psubusb    %%mm1, %%mm5 \n\t"
524
+        "movd       %%mm2, %0    \n\t"
525
+        "movd       %%mm3, %1    \n\t"
526
+        "movd       %%mm4, %2    \n\t"
527
+        "movd       %%mm5, %3    \n\t"
528
+        :"+m"(*(uint32_t*)(dest+0*linesize)),
529
+         "+m"(*(uint32_t*)(dest+1*linesize)),
530
+         "+m"(*(uint32_t*)(dest+2*linesize)),
531
+         "+m"(*(uint32_t*)(dest+3*linesize))
532
+    );
533
+}
534
+
535
+static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
536
+{
537
+    int dc = block[0];
538
+    dc = (17 * dc +  4) >> 3;
539
+    dc = (12 * dc + 64) >> 7;
540
+    __asm__ volatile(
541
+        "movd          %0, %%mm0 \n\t"
542
+        "pshufw $0, %%mm0, %%mm0 \n\t"
543
+        "pxor       %%mm1, %%mm1 \n\t"
544
+        "psubw      %%mm0, %%mm1 \n\t"
545
+        "packuswb   %%mm0, %%mm0 \n\t"
546
+        "packuswb   %%mm1, %%mm1 \n\t"
547
+        ::"r"(dc)
548
+    );
549
+    __asm__ volatile(
550
+        "movd          %0, %%mm2 \n\t"
551
+        "movd          %1, %%mm3 \n\t"
552
+        "movd          %2, %%mm4 \n\t"
553
+        "movd          %3, %%mm5 \n\t"
554
+        "paddusb    %%mm0, %%mm2 \n\t"
555
+        "paddusb    %%mm0, %%mm3 \n\t"
556
+        "paddusb    %%mm0, %%mm4 \n\t"
557
+        "paddusb    %%mm0, %%mm5 \n\t"
558
+        "psubusb    %%mm1, %%mm2 \n\t"
559
+        "psubusb    %%mm1, %%mm3 \n\t"
560
+        "psubusb    %%mm1, %%mm4 \n\t"
561
+        "psubusb    %%mm1, %%mm5 \n\t"
562
+        "movd       %%mm2, %0    \n\t"
563
+        "movd       %%mm3, %1    \n\t"
564
+        "movd       %%mm4, %2    \n\t"
565
+        "movd       %%mm5, %3    \n\t"
566
+        :"+m"(*(uint32_t*)(dest+0*linesize)),
567
+         "+m"(*(uint32_t*)(dest+1*linesize)),
568
+         "+m"(*(uint32_t*)(dest+2*linesize)),
569
+         "+m"(*(uint32_t*)(dest+3*linesize))
570
+    );
571
+    dest += 4*linesize;
572
+    __asm__ volatile(
573
+        "movd          %0, %%mm2 \n\t"
574
+        "movd          %1, %%mm3 \n\t"
575
+        "movd          %2, %%mm4 \n\t"
576
+        "movd          %3, %%mm5 \n\t"
577
+        "paddusb    %%mm0, %%mm2 \n\t"
578
+        "paddusb    %%mm0, %%mm3 \n\t"
579
+        "paddusb    %%mm0, %%mm4 \n\t"
580
+        "paddusb    %%mm0, %%mm5 \n\t"
581
+        "psubusb    %%mm1, %%mm2 \n\t"
582
+        "psubusb    %%mm1, %%mm3 \n\t"
583
+        "psubusb    %%mm1, %%mm4 \n\t"
584
+        "psubusb    %%mm1, %%mm5 \n\t"
585
+        "movd       %%mm2, %0    \n\t"
586
+        "movd       %%mm3, %1    \n\t"
587
+        "movd       %%mm4, %2    \n\t"
588
+        "movd       %%mm5, %3    \n\t"
589
+        :"+m"(*(uint32_t*)(dest+0*linesize)),
590
+         "+m"(*(uint32_t*)(dest+1*linesize)),
591
+         "+m"(*(uint32_t*)(dest+2*linesize)),
592
+         "+m"(*(uint32_t*)(dest+3*linesize))
593
+    );
594
+}
595
+
596
+static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
597
+{
598
+    int dc = block[0];
599
+    dc = ( 3 * dc +  1) >> 1;
600
+    dc = (17 * dc + 64) >> 7;
601
+    __asm__ volatile(
602
+        "movd          %0, %%mm0 \n\t"
603
+        "pshufw $0, %%mm0, %%mm0 \n\t"
604
+        "pxor       %%mm1, %%mm1 \n\t"
605
+        "psubw      %%mm0, %%mm1 \n\t"
606
+        "packuswb   %%mm0, %%mm0 \n\t"
607
+        "packuswb   %%mm1, %%mm1 \n\t"
608
+        ::"r"(dc)
609
+    );
610
+    __asm__ volatile(
611
+        "movq          %0, %%mm2 \n\t"
612
+        "movq          %1, %%mm3 \n\t"
613
+        "movq          %2, %%mm4 \n\t"
614
+        "movq          %3, %%mm5 \n\t"
615
+        "paddusb    %%mm0, %%mm2 \n\t"
616
+        "paddusb    %%mm0, %%mm3 \n\t"
617
+        "paddusb    %%mm0, %%mm4 \n\t"
618
+        "paddusb    %%mm0, %%mm5 \n\t"
619
+        "psubusb    %%mm1, %%mm2 \n\t"
620
+        "psubusb    %%mm1, %%mm3 \n\t"
621
+        "psubusb    %%mm1, %%mm4 \n\t"
622
+        "psubusb    %%mm1, %%mm5 \n\t"
623
+        "movq       %%mm2, %0    \n\t"
624
+        "movq       %%mm3, %1    \n\t"
625
+        "movq       %%mm4, %2    \n\t"
626
+        "movq       %%mm5, %3    \n\t"
627
+        :"+m"(*(uint32_t*)(dest+0*linesize)),
628
+         "+m"(*(uint32_t*)(dest+1*linesize)),
629
+         "+m"(*(uint32_t*)(dest+2*linesize)),
630
+         "+m"(*(uint32_t*)(dest+3*linesize))
631
+    );
632
+}
633
+
634
+static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
635
+{
636
+    int dc = block[0];
637
+    dc = (3 * dc +  1) >> 1;
638
+    dc = (3 * dc + 16) >> 5;
639
+    __asm__ volatile(
640
+        "movd          %0, %%mm0 \n\t"
641
+        "pshufw $0, %%mm0, %%mm0 \n\t"
642
+        "pxor       %%mm1, %%mm1 \n\t"
643
+        "psubw      %%mm0, %%mm1 \n\t"
644
+        "packuswb   %%mm0, %%mm0 \n\t"
645
+        "packuswb   %%mm1, %%mm1 \n\t"
646
+        ::"r"(dc)
647
+    );
648
+    __asm__ volatile(
649
+        "movq          %0, %%mm2 \n\t"
650
+        "movq          %1, %%mm3 \n\t"
651
+        "movq          %2, %%mm4 \n\t"
652
+        "movq          %3, %%mm5 \n\t"
653
+        "paddusb    %%mm0, %%mm2 \n\t"
654
+        "paddusb    %%mm0, %%mm3 \n\t"
655
+        "paddusb    %%mm0, %%mm4 \n\t"
656
+        "paddusb    %%mm0, %%mm5 \n\t"
657
+        "psubusb    %%mm1, %%mm2 \n\t"
658
+        "psubusb    %%mm1, %%mm3 \n\t"
659
+        "psubusb    %%mm1, %%mm4 \n\t"
660
+        "psubusb    %%mm1, %%mm5 \n\t"
661
+        "movq       %%mm2, %0    \n\t"
662
+        "movq       %%mm3, %1    \n\t"
663
+        "movq       %%mm4, %2    \n\t"
664
+        "movq       %%mm5, %3    \n\t"
665
+        :"+m"(*(uint32_t*)(dest+0*linesize)),
666
+         "+m"(*(uint32_t*)(dest+1*linesize)),
667
+         "+m"(*(uint32_t*)(dest+2*linesize)),
668
+         "+m"(*(uint32_t*)(dest+3*linesize))
669
+    );
670
+    dest += 4*linesize;
671
+    __asm__ volatile(
672
+        "movq          %0, %%mm2 \n\t"
673
+        "movq          %1, %%mm3 \n\t"
674
+        "movq          %2, %%mm4 \n\t"
675
+        "movq          %3, %%mm5 \n\t"
676
+        "paddusb    %%mm0, %%mm2 \n\t"
677
+        "paddusb    %%mm0, %%mm3 \n\t"
678
+        "paddusb    %%mm0, %%mm4 \n\t"
679
+        "paddusb    %%mm0, %%mm5 \n\t"
680
+        "psubusb    %%mm1, %%mm2 \n\t"
681
+        "psubusb    %%mm1, %%mm3 \n\t"
682
+        "psubusb    %%mm1, %%mm4 \n\t"
683
+        "psubusb    %%mm1, %%mm5 \n\t"
684
+        "movq       %%mm2, %0    \n\t"
685
+        "movq       %%mm3, %1    \n\t"
686
+        "movq       %%mm4, %2    \n\t"
687
+        "movq       %%mm5, %3    \n\t"
688
+        :"+m"(*(uint32_t*)(dest+0*linesize)),
689
+         "+m"(*(uint32_t*)(dest+1*linesize)),
690
+         "+m"(*(uint32_t*)(dest+2*linesize)),
691
+         "+m"(*(uint32_t*)(dest+3*linesize))
692
+    );
693
+}
694
+
497 695
 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
498 696
     mm_flags = mm_support();
499 697
 
... ...
@@ -537,5 +735,10 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
537 537
         dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2;
538 538
         dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2;
539 539
         dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2;
540
+
541
+        dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2;
542
+        dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2;
543
+        dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
544
+        dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
540 545
     }
541 546
 }