Originally committed as revision 19204 to svn://svn.ffmpeg.org/ffmpeg/trunk
Jason Garrett-Glaser authored on 2009/06/16 18:00:55... | ... |
@@ -486,6 +486,10 @@ typedef struct DSPContext { |
486 | 486 |
void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block); |
487 | 487 |
void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block); |
488 | 488 |
void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block); |
489 |
+ void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, int line_size, DCTELEM *block); |
|
490 |
+ void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM *block); |
|
491 |
+ void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM *block); |
|
492 |
+ void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM *block); |
|
489 | 493 |
void (*vc1_v_overlap)(uint8_t* src, int stride); |
490 | 494 |
void (*vc1_h_overlap)(uint8_t* src, int stride); |
491 | 495 |
void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq); |
... | ... |
@@ -337,6 +337,10 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte |
337 | 337 |
v->s.dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add; |
338 | 338 |
v->s.dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add; |
339 | 339 |
v->s.dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add; |
340 |
+ v->s.dsp.vc1_inv_trans_8x8_dc = ff_simple_idct_add; |
|
341 |
+ v->s.dsp.vc1_inv_trans_8x4_dc = ff_simple_idct84_add; |
|
342 |
+ v->s.dsp.vc1_inv_trans_4x8_dc = ff_simple_idct48_add; |
|
343 |
+ v->s.dsp.vc1_inv_trans_4x4_dc = ff_simple_idct44_add; |
|
340 | 344 |
} |
341 | 345 |
|
342 | 346 |
v->fastuvmc = get_bits1(gb); //common |
... | ... |
@@ -2028,8 +2028,12 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan |
2028 | 2028 |
block[idx] += (block[idx] < 0) ? -mquant : mquant; |
2029 | 2029 |
} |
2030 | 2030 |
if(!skip_block){ |
2031 |
- s->dsp.vc1_inv_trans_8x8(block); |
|
2032 |
- s->dsp.add_pixels_clamped(block, dst, linesize); |
|
2031 |
+ if(i==1) |
|
2032 |
+ s->dsp.vc1_inv_trans_8x8_dc(dst, linesize, block); |
|
2033 |
+ else{ |
|
2034 |
+ s->dsp.vc1_inv_trans_8x8(block); |
|
2035 |
+ s->dsp.add_pixels_clamped(block, dst, linesize); |
|
2036 |
+ } |
|
2033 | 2037 |
if(apply_filter && cbp_top & 0xC) |
2034 | 2038 |
s->dsp.vc1_v_loop_filter8(dst, linesize, v->pq); |
2035 | 2039 |
if(apply_filter && cbp_left & 0xA) |
... | ... |
@@ -2053,7 +2057,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan |
2053 | 2053 |
block[idx + off] += (block[idx + off] < 0) ? -mquant : mquant; |
2054 | 2054 |
} |
2055 | 2055 |
if(!(subblkpat & (1 << (3 - j))) && !skip_block){ |
2056 |
- s->dsp.vc1_inv_trans_4x4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off); |
|
2056 |
+ if(i==1) |
|
2057 |
+ s->dsp.vc1_inv_trans_4x4_dc(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off); |
|
2058 |
+ else |
|
2059 |
+ s->dsp.vc1_inv_trans_4x4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off); |
|
2057 | 2060 |
if(apply_filter && (j&2 ? pat & (1<<(j-2)) : (cbp_top & (1 << (j + 2))))) |
2058 | 2061 |
s->dsp.vc1_v_loop_filter4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, v->pq); |
2059 | 2062 |
if(apply_filter && (j&1 ? pat & (1<<(j-1)) : (cbp_left & (1 << (j + 1))))) |
... | ... |
@@ -2078,7 +2085,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan |
2078 | 2078 |
block[idx] += (block[idx] < 0) ? -mquant : mquant; |
2079 | 2079 |
} |
2080 | 2080 |
if(!(subblkpat & (1 << (1 - j))) && !skip_block){ |
2081 |
- s->dsp.vc1_inv_trans_8x4(dst + j*4*linesize, linesize, block + off); |
|
2081 |
+ if(i==1) |
|
2082 |
+ s->dsp.vc1_inv_trans_8x4_dc(dst + j*4*linesize, linesize, block + off); |
|
2083 |
+ else |
|
2084 |
+ s->dsp.vc1_inv_trans_8x4(dst + j*4*linesize, linesize, block + off); |
|
2082 | 2085 |
if(apply_filter && j ? pat & 0x3 : (cbp_top & 0xC)) |
2083 | 2086 |
s->dsp.vc1_v_loop_filter8(dst + j*4*linesize, linesize, v->pq); |
2084 | 2087 |
if(apply_filter && cbp_left & (2 << j)) |
... | ... |
@@ -2103,7 +2113,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan |
2103 | 2103 |
block[idx] += (block[idx] < 0) ? -mquant : mquant; |
2104 | 2104 |
} |
2105 | 2105 |
if(!(subblkpat & (1 << (1 - j))) && !skip_block){ |
2106 |
- s->dsp.vc1_inv_trans_4x8(dst + j*4, linesize, block + off); |
|
2106 |
+ if(i==1) |
|
2107 |
+ s->dsp.vc1_inv_trans_4x8_dc(dst + j*4, linesize, block + off); |
|
2108 |
+ else |
|
2109 |
+ s->dsp.vc1_inv_trans_4x8(dst + j*4, linesize, block + off); |
|
2107 | 2110 |
if(apply_filter && cbp_top & (2 << j)) |
2108 | 2111 |
s->dsp.vc1_v_loop_filter4(dst + j*4, linesize, v->pq); |
2109 | 2112 |
if(apply_filter && j ? pat & 0x5 : (cbp_left & 0xA)) |
... | ... |
@@ -178,6 +178,26 @@ static void vc1_h_loop_filter16_c(uint8_t *src, int stride, int pq) |
178 | 178 |
|
179 | 179 |
/** Do inverse transform on 8x8 block |
180 | 180 |
*/ |
181 |
+static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block) |
|
182 |
+{ |
|
183 |
+ int i; |
|
184 |
+ int dc = block[0]; |
|
185 |
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
|
186 |
+ dc = (3 * dc + 1) >> 1; |
|
187 |
+ dc = (3 * dc + 16) >> 5; |
|
188 |
+ for(i = 0; i < 8; i++){ |
|
189 |
+ dest[0] = cm[dest[0]+dc]; |
|
190 |
+ dest[1] = cm[dest[1]+dc]; |
|
191 |
+ dest[2] = cm[dest[2]+dc]; |
|
192 |
+ dest[3] = cm[dest[3]+dc]; |
|
193 |
+ dest[4] = cm[dest[4]+dc]; |
|
194 |
+ dest[5] = cm[dest[5]+dc]; |
|
195 |
+ dest[6] = cm[dest[6]+dc]; |
|
196 |
+ dest[7] = cm[dest[7]+dc]; |
|
197 |
+ dest += linesize; |
|
198 |
+ } |
|
199 |
+} |
|
200 |
+ |
|
181 | 201 |
static void vc1_inv_trans_8x8_c(DCTELEM block[64]) |
182 | 202 |
{ |
183 | 203 |
int i; |
... | ... |
@@ -249,6 +269,26 @@ static void vc1_inv_trans_8x8_c(DCTELEM block[64]) |
249 | 249 |
|
250 | 250 |
/** Do inverse transform on 8x4 part of block |
251 | 251 |
*/ |
252 |
+static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block) |
|
253 |
+{ |
|
254 |
+ int i; |
|
255 |
+ int dc = block[0]; |
|
256 |
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
|
257 |
+ dc = ( 3 * dc + 1) >> 1; |
|
258 |
+ dc = (17 * dc + 64) >> 7; |
|
259 |
+ for(i = 0; i < 4; i++){ |
|
260 |
+ dest[0] = cm[dest[0]+dc]; |
|
261 |
+ dest[1] = cm[dest[1]+dc]; |
|
262 |
+ dest[2] = cm[dest[2]+dc]; |
|
263 |
+ dest[3] = cm[dest[3]+dc]; |
|
264 |
+ dest[4] = cm[dest[4]+dc]; |
|
265 |
+ dest[5] = cm[dest[5]+dc]; |
|
266 |
+ dest[6] = cm[dest[6]+dc]; |
|
267 |
+ dest[7] = cm[dest[7]+dc]; |
|
268 |
+ dest += linesize; |
|
269 |
+ } |
|
270 |
+} |
|
271 |
+ |
|
252 | 272 |
static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block) |
253 | 273 |
{ |
254 | 274 |
int i; |
... | ... |
@@ -306,6 +346,22 @@ static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block) |
306 | 306 |
|
307 | 307 |
/** Do inverse transform on 4x8 parts of block |
308 | 308 |
*/ |
309 |
+static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block) |
|
310 |
+{ |
|
311 |
+ int i; |
|
312 |
+ int dc = block[0]; |
|
313 |
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
|
314 |
+ dc = (17 * dc + 4) >> 3; |
|
315 |
+ dc = (12 * dc + 64) >> 7; |
|
316 |
+ for(i = 0; i < 8; i++){ |
|
317 |
+ dest[0] = cm[dest[0]+dc]; |
|
318 |
+ dest[1] = cm[dest[1]+dc]; |
|
319 |
+ dest[2] = cm[dest[2]+dc]; |
|
320 |
+ dest[3] = cm[dest[3]+dc]; |
|
321 |
+ dest += linesize; |
|
322 |
+ } |
|
323 |
+} |
|
324 |
+ |
|
309 | 325 |
static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block) |
310 | 326 |
{ |
311 | 327 |
int i; |
... | ... |
@@ -363,6 +419,22 @@ static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block) |
363 | 363 |
|
364 | 364 |
/** Do inverse transform on 4x4 part of block |
365 | 365 |
*/ |
366 |
+static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block) |
|
367 |
+{ |
|
368 |
+ int i; |
|
369 |
+ int dc = block[0]; |
|
370 |
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
|
371 |
+ dc = (17 * dc + 4) >> 3; |
|
372 |
+ dc = (17 * dc + 64) >> 7; |
|
373 |
+ for(i = 0; i < 4; i++){ |
|
374 |
+ dest[0] = cm[dest[0]+dc]; |
|
375 |
+ dest[1] = cm[dest[1]+dc]; |
|
376 |
+ dest[2] = cm[dest[2]+dc]; |
|
377 |
+ dest[3] = cm[dest[3]+dc]; |
|
378 |
+ dest += linesize; |
|
379 |
+ } |
|
380 |
+} |
|
381 |
+ |
|
366 | 382 |
static void vc1_inv_trans_4x4_c(uint8_t *dest, int linesize, DCTELEM *block) |
367 | 383 |
{ |
368 | 384 |
int i; |
... | ... |
@@ -545,6 +617,10 @@ void ff_vc1dsp_init(DSPContext* dsp, AVCodecContext *avctx) { |
545 | 545 |
dsp->vc1_inv_trans_4x8 = vc1_inv_trans_4x8_c; |
546 | 546 |
dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_c; |
547 | 547 |
dsp->vc1_inv_trans_4x4 = vc1_inv_trans_4x4_c; |
548 |
+ dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_c; |
|
549 |
+ dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_c; |
|
550 |
+ dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_c; |
|
551 |
+ dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c; |
|
548 | 552 |
dsp->vc1_h_overlap = vc1_h_overlap_c; |
549 | 553 |
dsp->vc1_v_overlap = vc1_v_overlap_c; |
550 | 554 |
dsp->vc1_v_loop_filter4 = vc1_v_loop_filter4_c; |
... | ... |
@@ -494,6 +494,204 @@ DECLARE_FUNCTION(3, 1) |
494 | 494 |
DECLARE_FUNCTION(3, 2) |
495 | 495 |
DECLARE_FUNCTION(3, 3) |
496 | 496 |
|
497 |
+static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
|
498 |
+{ |
|
499 |
+ int dc = block[0]; |
|
500 |
+ dc = (17 * dc + 4) >> 3; |
|
501 |
+ dc = (17 * dc + 64) >> 7; |
|
502 |
+ __asm__ volatile( |
|
503 |
+ "movd %0, %%mm0 \n\t" |
|
504 |
+ "pshufw $0, %%mm0, %%mm0 \n\t" |
|
505 |
+ "pxor %%mm1, %%mm1 \n\t" |
|
506 |
+ "psubw %%mm0, %%mm1 \n\t" |
|
507 |
+ "packuswb %%mm0, %%mm0 \n\t" |
|
508 |
+ "packuswb %%mm1, %%mm1 \n\t" |
|
509 |
+ ::"r"(dc) |
|
510 |
+ ); |
|
511 |
+ __asm__ volatile( |
|
512 |
+ "movd %0, %%mm2 \n\t" |
|
513 |
+ "movd %1, %%mm3 \n\t" |
|
514 |
+ "movd %2, %%mm4 \n\t" |
|
515 |
+ "movd %3, %%mm5 \n\t" |
|
516 |
+ "paddusb %%mm0, %%mm2 \n\t" |
|
517 |
+ "paddusb %%mm0, %%mm3 \n\t" |
|
518 |
+ "paddusb %%mm0, %%mm4 \n\t" |
|
519 |
+ "paddusb %%mm0, %%mm5 \n\t" |
|
520 |
+ "psubusb %%mm1, %%mm2 \n\t" |
|
521 |
+ "psubusb %%mm1, %%mm3 \n\t" |
|
522 |
+ "psubusb %%mm1, %%mm4 \n\t" |
|
523 |
+ "psubusb %%mm1, %%mm5 \n\t" |
|
524 |
+ "movd %%mm2, %0 \n\t" |
|
525 |
+ "movd %%mm3, %1 \n\t" |
|
526 |
+ "movd %%mm4, %2 \n\t" |
|
527 |
+ "movd %%mm5, %3 \n\t" |
|
528 |
+ :"+m"(*(uint32_t*)(dest+0*linesize)), |
|
529 |
+ "+m"(*(uint32_t*)(dest+1*linesize)), |
|
530 |
+ "+m"(*(uint32_t*)(dest+2*linesize)), |
|
531 |
+ "+m"(*(uint32_t*)(dest+3*linesize)) |
|
532 |
+ ); |
|
533 |
+} |
|
534 |
+ |
|
535 |
+static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
|
536 |
+{ |
|
537 |
+ int dc = block[0]; |
|
538 |
+ dc = (17 * dc + 4) >> 3; |
|
539 |
+ dc = (12 * dc + 64) >> 7; |
|
540 |
+ __asm__ volatile( |
|
541 |
+ "movd %0, %%mm0 \n\t" |
|
542 |
+ "pshufw $0, %%mm0, %%mm0 \n\t" |
|
543 |
+ "pxor %%mm1, %%mm1 \n\t" |
|
544 |
+ "psubw %%mm0, %%mm1 \n\t" |
|
545 |
+ "packuswb %%mm0, %%mm0 \n\t" |
|
546 |
+ "packuswb %%mm1, %%mm1 \n\t" |
|
547 |
+ ::"r"(dc) |
|
548 |
+ ); |
|
549 |
+ __asm__ volatile( |
|
550 |
+ "movd %0, %%mm2 \n\t" |
|
551 |
+ "movd %1, %%mm3 \n\t" |
|
552 |
+ "movd %2, %%mm4 \n\t" |
|
553 |
+ "movd %3, %%mm5 \n\t" |
|
554 |
+ "paddusb %%mm0, %%mm2 \n\t" |
|
555 |
+ "paddusb %%mm0, %%mm3 \n\t" |
|
556 |
+ "paddusb %%mm0, %%mm4 \n\t" |
|
557 |
+ "paddusb %%mm0, %%mm5 \n\t" |
|
558 |
+ "psubusb %%mm1, %%mm2 \n\t" |
|
559 |
+ "psubusb %%mm1, %%mm3 \n\t" |
|
560 |
+ "psubusb %%mm1, %%mm4 \n\t" |
|
561 |
+ "psubusb %%mm1, %%mm5 \n\t" |
|
562 |
+ "movd %%mm2, %0 \n\t" |
|
563 |
+ "movd %%mm3, %1 \n\t" |
|
564 |
+ "movd %%mm4, %2 \n\t" |
|
565 |
+ "movd %%mm5, %3 \n\t" |
|
566 |
+ :"+m"(*(uint32_t*)(dest+0*linesize)), |
|
567 |
+ "+m"(*(uint32_t*)(dest+1*linesize)), |
|
568 |
+ "+m"(*(uint32_t*)(dest+2*linesize)), |
|
569 |
+ "+m"(*(uint32_t*)(dest+3*linesize)) |
|
570 |
+ ); |
|
571 |
+ dest += 4*linesize; |
|
572 |
+ __asm__ volatile( |
|
573 |
+ "movd %0, %%mm2 \n\t" |
|
574 |
+ "movd %1, %%mm3 \n\t" |
|
575 |
+ "movd %2, %%mm4 \n\t" |
|
576 |
+ "movd %3, %%mm5 \n\t" |
|
577 |
+ "paddusb %%mm0, %%mm2 \n\t" |
|
578 |
+ "paddusb %%mm0, %%mm3 \n\t" |
|
579 |
+ "paddusb %%mm0, %%mm4 \n\t" |
|
580 |
+ "paddusb %%mm0, %%mm5 \n\t" |
|
581 |
+ "psubusb %%mm1, %%mm2 \n\t" |
|
582 |
+ "psubusb %%mm1, %%mm3 \n\t" |
|
583 |
+ "psubusb %%mm1, %%mm4 \n\t" |
|
584 |
+ "psubusb %%mm1, %%mm5 \n\t" |
|
585 |
+ "movd %%mm2, %0 \n\t" |
|
586 |
+ "movd %%mm3, %1 \n\t" |
|
587 |
+ "movd %%mm4, %2 \n\t" |
|
588 |
+ "movd %%mm5, %3 \n\t" |
|
589 |
+ :"+m"(*(uint32_t*)(dest+0*linesize)), |
|
590 |
+ "+m"(*(uint32_t*)(dest+1*linesize)), |
|
591 |
+ "+m"(*(uint32_t*)(dest+2*linesize)), |
|
592 |
+ "+m"(*(uint32_t*)(dest+3*linesize)) |
|
593 |
+ ); |
|
594 |
+} |
|
595 |
+ |
|
596 |
+static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
|
597 |
+{ |
|
598 |
+ int dc = block[0]; |
|
599 |
+ dc = ( 3 * dc + 1) >> 1; |
|
600 |
+ dc = (17 * dc + 64) >> 7; |
|
601 |
+ __asm__ volatile( |
|
602 |
+ "movd %0, %%mm0 \n\t" |
|
603 |
+ "pshufw $0, %%mm0, %%mm0 \n\t" |
|
604 |
+ "pxor %%mm1, %%mm1 \n\t" |
|
605 |
+ "psubw %%mm0, %%mm1 \n\t" |
|
606 |
+ "packuswb %%mm0, %%mm0 \n\t" |
|
607 |
+ "packuswb %%mm1, %%mm1 \n\t" |
|
608 |
+ ::"r"(dc) |
|
609 |
+ ); |
|
610 |
+ __asm__ volatile( |
|
611 |
+ "movq %0, %%mm2 \n\t" |
|
612 |
+ "movq %1, %%mm3 \n\t" |
|
613 |
+ "movq %2, %%mm4 \n\t" |
|
614 |
+ "movq %3, %%mm5 \n\t" |
|
615 |
+ "paddusb %%mm0, %%mm2 \n\t" |
|
616 |
+ "paddusb %%mm0, %%mm3 \n\t" |
|
617 |
+ "paddusb %%mm0, %%mm4 \n\t" |
|
618 |
+ "paddusb %%mm0, %%mm5 \n\t" |
|
619 |
+ "psubusb %%mm1, %%mm2 \n\t" |
|
620 |
+ "psubusb %%mm1, %%mm3 \n\t" |
|
621 |
+ "psubusb %%mm1, %%mm4 \n\t" |
|
622 |
+ "psubusb %%mm1, %%mm5 \n\t" |
|
623 |
+ "movq %%mm2, %0 \n\t" |
|
624 |
+ "movq %%mm3, %1 \n\t" |
|
625 |
+ "movq %%mm4, %2 \n\t" |
|
626 |
+ "movq %%mm5, %3 \n\t" |
|
627 |
+ :"+m"(*(uint32_t*)(dest+0*linesize)), |
|
628 |
+ "+m"(*(uint32_t*)(dest+1*linesize)), |
|
629 |
+ "+m"(*(uint32_t*)(dest+2*linesize)), |
|
630 |
+ "+m"(*(uint32_t*)(dest+3*linesize)) |
|
631 |
+ ); |
|
632 |
+} |
|
633 |
+ |
|
634 |
+static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
|
635 |
+{ |
|
636 |
+ int dc = block[0]; |
|
637 |
+ dc = (3 * dc + 1) >> 1; |
|
638 |
+ dc = (3 * dc + 16) >> 5; |
|
639 |
+ __asm__ volatile( |
|
640 |
+ "movd %0, %%mm0 \n\t" |
|
641 |
+ "pshufw $0, %%mm0, %%mm0 \n\t" |
|
642 |
+ "pxor %%mm1, %%mm1 \n\t" |
|
643 |
+ "psubw %%mm0, %%mm1 \n\t" |
|
644 |
+ "packuswb %%mm0, %%mm0 \n\t" |
|
645 |
+ "packuswb %%mm1, %%mm1 \n\t" |
|
646 |
+ ::"r"(dc) |
|
647 |
+ ); |
|
648 |
+ __asm__ volatile( |
|
649 |
+ "movq %0, %%mm2 \n\t" |
|
650 |
+ "movq %1, %%mm3 \n\t" |
|
651 |
+ "movq %2, %%mm4 \n\t" |
|
652 |
+ "movq %3, %%mm5 \n\t" |
|
653 |
+ "paddusb %%mm0, %%mm2 \n\t" |
|
654 |
+ "paddusb %%mm0, %%mm3 \n\t" |
|
655 |
+ "paddusb %%mm0, %%mm4 \n\t" |
|
656 |
+ "paddusb %%mm0, %%mm5 \n\t" |
|
657 |
+ "psubusb %%mm1, %%mm2 \n\t" |
|
658 |
+ "psubusb %%mm1, %%mm3 \n\t" |
|
659 |
+ "psubusb %%mm1, %%mm4 \n\t" |
|
660 |
+ "psubusb %%mm1, %%mm5 \n\t" |
|
661 |
+ "movq %%mm2, %0 \n\t" |
|
662 |
+ "movq %%mm3, %1 \n\t" |
|
663 |
+ "movq %%mm4, %2 \n\t" |
|
664 |
+ "movq %%mm5, %3 \n\t" |
|
665 |
+ :"+m"(*(uint32_t*)(dest+0*linesize)), |
|
666 |
+ "+m"(*(uint32_t*)(dest+1*linesize)), |
|
667 |
+ "+m"(*(uint32_t*)(dest+2*linesize)), |
|
668 |
+ "+m"(*(uint32_t*)(dest+3*linesize)) |
|
669 |
+ ); |
|
670 |
+ dest += 4*linesize; |
|
671 |
+ __asm__ volatile( |
|
672 |
+ "movq %0, %%mm2 \n\t" |
|
673 |
+ "movq %1, %%mm3 \n\t" |
|
674 |
+ "movq %2, %%mm4 \n\t" |
|
675 |
+ "movq %3, %%mm5 \n\t" |
|
676 |
+ "paddusb %%mm0, %%mm2 \n\t" |
|
677 |
+ "paddusb %%mm0, %%mm3 \n\t" |
|
678 |
+ "paddusb %%mm0, %%mm4 \n\t" |
|
679 |
+ "paddusb %%mm0, %%mm5 \n\t" |
|
680 |
+ "psubusb %%mm1, %%mm2 \n\t" |
|
681 |
+ "psubusb %%mm1, %%mm3 \n\t" |
|
682 |
+ "psubusb %%mm1, %%mm4 \n\t" |
|
683 |
+ "psubusb %%mm1, %%mm5 \n\t" |
|
684 |
+ "movq %%mm2, %0 \n\t" |
|
685 |
+ "movq %%mm3, %1 \n\t" |
|
686 |
+ "movq %%mm4, %2 \n\t" |
|
687 |
+ "movq %%mm5, %3 \n\t" |
|
688 |
+ :"+m"(*(uint32_t*)(dest+0*linesize)), |
|
689 |
+ "+m"(*(uint32_t*)(dest+1*linesize)), |
|
690 |
+ "+m"(*(uint32_t*)(dest+2*linesize)), |
|
691 |
+ "+m"(*(uint32_t*)(dest+3*linesize)) |
|
692 |
+ ); |
|
693 |
+} |
|
694 |
+ |
|
497 | 695 |
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { |
498 | 696 |
mm_flags = mm_support(); |
499 | 697 |
|
... | ... |
@@ -537,5 +735,10 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { |
537 | 537 |
dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; |
538 | 538 |
dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; |
539 | 539 |
dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; |
540 |
+ |
|
541 |
+ dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; |
|
542 |
+ dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; |
|
543 |
+ dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; |
|
544 |
+ dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; |
|
540 | 545 |
} |
541 | 546 |
} |